Board Game Behavior: An Analysis of Demographics and Gameplay Preferences¶

Documentation¶

Github Repository

Initial Project Proposal (11/8/23)

Milestone One (11/18/23)

Milestone Two(12/3/23)

Data Processing and Modeling¶

In [1]:
!python -m pip install pandas
!python -m pip install openpyxl
!python -m pip install seaborn
!python -m pip install scikit-learn
Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas) (2023.3.post1)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 341.8/341.8 kB 12.0 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.20.3 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas) (1.24.4)
Requirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 109.9 MB/s eta 0:00:00
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2023.3

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 250.0/250.0 kB 8.7 MB/s eta 0:00:00
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from seaborn) (1.24.4)
Requirement already satisfied: pandas>=1.2 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from seaborn) (2.0.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.3 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from seaborn) (3.7.4)
Requirement already satisfied: contourpy>=1.0.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (4.46.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (1.4.5)
Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (23.2)
Requirement already satisfied: pillow>=6.2.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (10.1.0)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (6.1.1)
Requirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas>=1.2->seaborn) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas>=1.2->seaborn) (2023.3)
Requirement already satisfied: zipp>=3.1.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.3->seaborn) (3.17.0)
Requirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.3->seaborn) (1.16.0)
Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 294.6/294.6 kB 10.6 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.0

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Requirement already satisfied: numpy<2.0,>=1.17.3 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from scikit-learn) (1.24.4)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.5/34.5 MB 66.5 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.1/11.1 MB 105.6 MB/s eta 0:00:00
Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 32.0 MB/s eta 0:00:00
Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.2 scipy-1.10.1 threadpoolctl-3.2.0

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip

Data Cleaning¶

In [2]:
import pandas as pd
import numpy as np

raw_df = pd.read_excel("datasets/rawdata.xlsx",keep_default_na=False)

raw_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 22 columns):
 #   Column                                                                                                                                               Non-Null Count  Dtype         
---  ------                                                                                                                                               --------------  -----         
 0   ID                                                                                                                                                   244 non-null    int64         
 1   Start time                                                                                                                                           244 non-null    datetime64[ns]
 2   Completion time                                                                                                                                      244 non-null    datetime64[ns]
 3   Email                                                                                                                                                244 non-null    object        
 4   Name                                                                                                                                                 244 non-null    object        
 5   Last modified time                                                                                                                                   244 non-null    object        
 6   I am a:                                                                                                                                              244 non-null    object        
 7   What is your Gender Identity?                                                                                                                        244 non-null    object        
 8   What is your Age?
(this field may remain blank)                                                                                                      244 non-null    object        
 9   What is your Race?                                                                                                                                   244 non-null    object        
 10  What is your current Employment Status?                                                                                                              244 non-null    object        
 11  Do you wear glasses or contact lenses for vision correction?                                                                                         244 non-null    object        
 12  What best describes your current religious or spiritual beliefs, if any?
                                                                            244 non-null    object        
 13  What is your Area of Study or Major? 
(this field may remain blank)
                                                                                 244 non-null    object        
 14  Do you enjoy playing board games, card games, or similar tabletop games?                                                                             244 non-null    object        
 15  How many board/card games do you own? 
(All non-roleplay table top games are included, i.e., chess, playing cards, Uno, Magic: The Gathering, etc.)  244 non-null    object        
 16  About how frequently do you play board/card games?                                                                                                   244 non-null    object        
 17  Select what best describes your engagement and style when it comes to board and card games?                                                          244 non-null    object        
 18  I prefer games that include elements of:
(select all that you prefer)                                                                                244 non-null    object        
 19  List up to 15 board/card games that you enjoy playing. 
(abide by the format: "Game 1", "Game 2", "Game 3")
(this field may remain blank)
           244 non-null    object        
 20  On a scale from 1 to 10, how much do you enjoy playing chess, with 1 being not enjoyable at all and 10 being extremely enjoyable?                    244 non-null    object        
 21  Select your all genres of board/card games you enjoy.
(if your preferred genre(s) is/are not listed, please list them in the "other" selection)      244 non-null    object        
dtypes: datetime64[ns](2), int64(1), object(19)
memory usage: 42.1+ KB
Drop Metadata¶
In [3]:
dropped_df = raw_df.drop(columns=["ID","Start time","Completion time","Email","Name","Last modified time"])

dropped_df[:0]
Out[3]:
I am a: What is your Gender Identity? What is your Age?\n(this field may remain blank) What is your Race? What is your current Employment Status? Do you wear glasses or contact lenses for vision correction? What best describes your current religious or spiritual beliefs, if any?\n What is your Area of Study or Major? \n(this field may remain blank)\n Do you enjoy playing board games, card games, or similar tabletop games? How many board/card games do you own? \n(All non-roleplay table top games are included, i.e., chess, playing cards, Uno, Magic: The Gathering, etc.) About how frequently do you play board/card games? Select what best describes your engagement and style when it comes to board and card games? I prefer games that include elements of:\n(select all that you prefer) List up to 15 board/card games that you enjoy playing. \n(abide by the format: "Game 1", "Game 2", "Game 3")\n(this field may remain blank)\n On a scale from 1 to 10, how much do you enjoy playing chess, with 1 being not enjoyable at all and 10 being extremely enjoyable? Select your all genres of board/card games you enjoy.\n(if your preferred genre(s) is/are not listed, please list them in the "other" selection)

Rename Raw Fields from Question to Concise Description¶

In [4]:
renamed_df = dropped_df.rename(columns={
    "I am a:": "WWUStatus",
    "What is your Gender Identity?": "Gender",
    "What is your Age?\n(this field may remain blank)": "Age",
    "What is your Race?": "Race(s)",
    "What is your current Employment Status?": "EmploymentStatus",
    "Do you wear glasses or contact lenses for vision correction?": "Vision",
    "What best describes your current religious or spiritual beliefs, if any?\n": "Religiosity",
    "What is your Area of Study or Major? \n(this field may remain blank)\n": "AOS",
    "Do you enjoy playing board games, card games, or similar tabletop games?": "EnjoysBoardGames",
    "How many board/card games do you own? \n(All non-roleplay table top games are included, i.e., chess, playing cards, Uno, Magic: The Gathering, etc.)": "BoardGamesOwned",
    "About how frequently do you play board/card games?": "FrequencyOfPlay",
    "Select what best describes your engagement and style when it comes to board and card games? ": "Style",
    "I prefer games that include elements of:\n(select all that you prefer)": "PreferredElements",
    "List up to 15 board/card games that you enjoy playing. \n(abide by the format: \"Game 1\", \"Game 2\", \"Game 3\")\n(this field may remain blank)\n": "EnjoyedBoardGames",
    "On a scale from 1 to 10, how much do you enjoy playing chess, with 1 being not enjoyable at all and 10 being extremely enjoyable?": "ChessRating",
    "Select your all genres of board/card games you enjoy.\n(if your preferred genre(s) is/are not listed, please list them in the \"other\" selection)": "EnjoyedGenres"
})

renamed_df[:0]
Out[4]:
WWUStatus Gender Age Race(s) EmploymentStatus Vision Religiosity AOS EnjoysBoardGames BoardGamesOwned FrequencyOfPlay Style PreferredElements EnjoyedBoardGames ChessRating EnjoyedGenres

Multiselect Binary/Boolean Features¶

Race(s)¶
In [5]:
renamed_df["Race(s)"].value_counts()
Out[5]:
Race(s)
White;                                           185
Asian;                                            14
Hispanic or Latino;                                9
White;Asian;                                       6
White;Hispanic or Latino;                          5
Black or African American;White;                   4
White;Black or African American;                   3
White;Native Hawaiian or Pacific Islander;         2
Black or African American;                         2
Asian;Hispanic or Latino;                          2
Black or African American;White;Asian;             1
Black or African American;Hispanic or Latino;      1
White;Native American or American Indian;          1
Multiracial;                                       1
Mixed ethnicity;                                   1
Native American or American Indian;                1
Asian;Filipino;                                    1
Hispanic or Latino;White;                          1
Asian;White;                                       1
White;Black or African American;Asian;             1
Prefer not to say;                                 1
Native American or American Indian;White;          1
Name: count, dtype: int64
In [6]:
raceClean_df = renamed_df.copy()

unique_races = set()
for elements in raceClean_df["Race(s)"].dropna():
    unique_races.update(elements.split(";"))

unique_races = list(unique_races)
unique_races.sort()
In [7]:
# Create a new column for each unique element with binary values (0 or 1 or None)
for element in unique_races:
    raceClean_df[(element + "IsRace").replace(" ", "")] = raceClean_df["Race(s)"].apply(
        lambda x: int(element in x) if pd.notna(x) else None
    )

raceClean_df = raceClean_df.drop(columns={"Race(s)","IsRace"})
PreferredElements¶
In [8]:
raceClean_df["PreferredElements"].value_counts()
Out[8]:
PreferredElements
Conflict/Competition;Strategy;                                                                                                                 7
Cooperation;Conflict/Competition;Luck;Strategy;Social Deduction/Hidden Role;Heavy/Immersive Theming;Puzzle-Solving;Trivia;Party/Low-Stakes;    6
Party/Low-Stakes;                                                                                                                              4
Conflict/Competition;Strategy;Trivia;                                                                                                          4
Cooperation;Conflict/Competition;Luck;Strategy;Puzzle-Solving;Party/Low-Stakes;                                                                3
                                                                                                                                              ..
Cooperation;Conflict/Competition;Strategy;Luck;                                                                                                1
Cooperation;Strategy;Luck;Conflict/Competition;Social Deduction/Hidden Role;Trivia;Party/Low-Stakes;                                           1
Luck;Strategy;Conflict/Competition;Party/Low-Stakes;Trivia;                                                                                    1
Cooperation;Luck;Conflict/Competition;                                                                                                         1
Social Deduction/Hidden Role;Party/Low-Stakes;                                                                                                 1
Name: count, Length: 202, dtype: int64
In [9]:
preferredElementsClean_df = raceClean_df.copy()

unique_preferred_elements = set()
for elements in preferredElementsClean_df["PreferredElements"].dropna():
    unique_preferred_elements.update(elements.split(";"))
unique_preferred_elements

unique_preferred_elements = list(unique_preferred_elements)
unique_preferred_elements.sort()
In [10]:
# Create a new column for each unique element with binary values (0 or 1 or None)
for element in unique_preferred_elements:
    preferredElementsClean_df[(element + "IsPreferredElement").replace(" ", "")] = preferredElementsClean_df["PreferredElements"].apply(
        lambda x: int(element in x) if pd.notna(x) else None
    )

preferredElementsClean_df = preferredElementsClean_df.drop(columns={"PreferredElements","IsPreferredElement"})
EnjoyedGenres¶
In [11]:
enjoyedGenresClean_df = preferredElementsClean_df.copy()

unique_enjoyed_genres = set()
for elements in enjoyedGenresClean_df["EnjoyedGenres"].dropna():
    unique_enjoyed_genres.update(elements.split(";"))
unique_enjoyed_genres

unique_enjoyed_genres = list(unique_enjoyed_genres)
unique_enjoyed_genres.sort()
In [12]:
# Create a new column for each unique element with binary values (0 or 1 or None)
for element in unique_enjoyed_genres:
    enjoyedGenresClean_df[(element + "IsEnjoyedGenre").replace(" ", "")] = enjoyedGenresClean_df["EnjoyedGenres"].apply(
        lambda x: int(element in x) if pd.notna(x) else None
    )

enjoyedGenresClean_df = enjoyedGenresClean_df.drop(columns={"EnjoyedGenres","IsEnjoyedGenre"})
In [13]:
enjoyedGenresClean_df.columns.tolist()
Out[13]:
['WWUStatus',
 'Gender',
 'Age',
 'EmploymentStatus',
 'Vision',
 'Religiosity',
 'AOS',
 'EnjoysBoardGames',
 'BoardGamesOwned',
 'FrequencyOfPlay',
 'Style',
 'EnjoyedBoardGames',
 'ChessRating',
 'AsianIsRace',
 'BlackorAfricanAmericanIsRace',
 'FilipinoIsRace',
 'HispanicorLatinoIsRace',
 'MixedethnicityIsRace',
 'MultiracialIsRace',
 'NativeAmericanorAmericanIndianIsRace',
 'NativeHawaiianorPacificIslanderIsRace',
 'PrefernottosayIsRace',
 'WhiteIsRace',
 'Conflict/CompetitionIsPreferredElement',
 'CooperationIsPreferredElement',
 'Heavy/ImmersiveThemingIsPreferredElement',
 'LuckIsPreferredElement',
 'Party/Low-StakesIsPreferredElement',
 'Puzzle-SolvingIsPreferredElement',
 'SocialDeduction/HiddenRoleIsPreferredElement',
 'StrategyIsPreferredElement',
 'TriviaIsPreferredElement',
 'AbstractStrategyIsEnjoyedGenre',
 'AdventureIsEnjoyedGenre',
 'AnimalsIsEnjoyedGenre',
 'AuctionIsEnjoyedGenre',
 'CardIsEnjoyedGenre',
 'CardDraftingIsEnjoyedGenre',
 'CivilizationIsEnjoyedGenre',
 'Cooperative\xa0IsEnjoyedGenre',
 'Deck-Building\xa0IsEnjoyedGenre',
 'DeductionIsEnjoyedGenre',
 'EconomicIsEnjoyedGenre',
 'EducationalIsEnjoyedGenre',
 'ExplorationIsEnjoyedGenre',
 'FantasyIsEnjoyedGenre',
 'FarmingIsEnjoyedGenre',
 'FightingIsEnjoyedGenre',
 'HorrorIsEnjoyedGenre',
 'LuckIsEnjoyedGenre',
 'MedievalIsEnjoyedGenre',
 'MemoryIsEnjoyedGenre',
 'MiniaturesIsEnjoyedGenre',
 'Party\xa0IsEnjoyedGenre',
 'PiratesIsEnjoyedGenre',
 'PoliticalIsEnjoyedGenre',
 'PuzzleIsEnjoyedGenre',
 'RacingIsEnjoyedGenre',
 'Role-Playing\xa0IsEnjoyedGenre',
 'RollandMove\xa0IsEnjoyedGenre',
 'ScienceFictionIsEnjoyedGenre',
 'SocialDeduction/HiddenRoleIsEnjoyedGenre',
 'SportsIsEnjoyedGenre',
 'StrategyIsEnjoyedGenre',
 'TerritoryBuildingIsEnjoyedGenre',
 'Tile-Laying\xa0IsEnjoyedGenre',
 'TrainsIsEnjoyedGenre',
 'TransportationIsEnjoyedGenre',
 'TravelIsEnjoyedGenre',
 'TriviaIsEnjoyedGenre',
 'War\xa0IsEnjoyedGenre',
 'Word\xa0IsEnjoyedGenre',
 'WorkerPlacementIsEnjoyedGenre',
 'WorldWarIIIsEnjoyedGenre',
 'ZombiesIsEnjoyedGenre']
In [14]:
reformatColumns_df = enjoyedGenresClean_df.rename(columns=lambda x: x.replace("\xa0", ""))

reformatColumns_df.columns.tolist()
Out[14]:
['WWUStatus',
 'Gender',
 'Age',
 'EmploymentStatus',
 'Vision',
 'Religiosity',
 'AOS',
 'EnjoysBoardGames',
 'BoardGamesOwned',
 'FrequencyOfPlay',
 'Style',
 'EnjoyedBoardGames',
 'ChessRating',
 'AsianIsRace',
 'BlackorAfricanAmericanIsRace',
 'FilipinoIsRace',
 'HispanicorLatinoIsRace',
 'MixedethnicityIsRace',
 'MultiracialIsRace',
 'NativeAmericanorAmericanIndianIsRace',
 'NativeHawaiianorPacificIslanderIsRace',
 'PrefernottosayIsRace',
 'WhiteIsRace',
 'Conflict/CompetitionIsPreferredElement',
 'CooperationIsPreferredElement',
 'Heavy/ImmersiveThemingIsPreferredElement',
 'LuckIsPreferredElement',
 'Party/Low-StakesIsPreferredElement',
 'Puzzle-SolvingIsPreferredElement',
 'SocialDeduction/HiddenRoleIsPreferredElement',
 'StrategyIsPreferredElement',
 'TriviaIsPreferredElement',
 'AbstractStrategyIsEnjoyedGenre',
 'AdventureIsEnjoyedGenre',
 'AnimalsIsEnjoyedGenre',
 'AuctionIsEnjoyedGenre',
 'CardIsEnjoyedGenre',
 'CardDraftingIsEnjoyedGenre',
 'CivilizationIsEnjoyedGenre',
 'CooperativeIsEnjoyedGenre',
 'Deck-BuildingIsEnjoyedGenre',
 'DeductionIsEnjoyedGenre',
 'EconomicIsEnjoyedGenre',
 'EducationalIsEnjoyedGenre',
 'ExplorationIsEnjoyedGenre',
 'FantasyIsEnjoyedGenre',
 'FarmingIsEnjoyedGenre',
 'FightingIsEnjoyedGenre',
 'HorrorIsEnjoyedGenre',
 'LuckIsEnjoyedGenre',
 'MedievalIsEnjoyedGenre',
 'MemoryIsEnjoyedGenre',
 'MiniaturesIsEnjoyedGenre',
 'PartyIsEnjoyedGenre',
 'PiratesIsEnjoyedGenre',
 'PoliticalIsEnjoyedGenre',
 'PuzzleIsEnjoyedGenre',
 'RacingIsEnjoyedGenre',
 'Role-PlayingIsEnjoyedGenre',
 'RollandMoveIsEnjoyedGenre',
 'ScienceFictionIsEnjoyedGenre',
 'SocialDeduction/HiddenRoleIsEnjoyedGenre',
 'SportsIsEnjoyedGenre',
 'StrategyIsEnjoyedGenre',
 'TerritoryBuildingIsEnjoyedGenre',
 'Tile-LayingIsEnjoyedGenre',
 'TrainsIsEnjoyedGenre',
 'TransportationIsEnjoyedGenre',
 'TravelIsEnjoyedGenre',
 'TriviaIsEnjoyedGenre',
 'WarIsEnjoyedGenre',
 'WordIsEnjoyedGenre',
 'WorkerPlacementIsEnjoyedGenre',
 'WorldWarIIIsEnjoyedGenre',
 'ZombiesIsEnjoyedGenre']

AOS (manual string bucketing)¶

In [15]:
aos_df = reformatColumns_df
aos_df["AOS"].str.lower().unique()
Out[15]:
array(['computer science ', 'data science ', 'data science', 'eece',
       'electrical and computer engineering ', 'statistics',
       'environmental studies', 'applied mathematics', 'chemistry', 'rml',
       'political science', 'elementary education', 'english',
       'music education', 'n/a', 'art', 'psychology', '',
       'psych (probably)', 'music', 'environmental science - toxicology',
       'history/museum studies', 'elementary ed',
       'environmental science ', 'mathematics ', 'business', 'biochem ',
       'vocal performance ', 'secondary education ', 'business ',
       'linguistics', 'history', 'bio/anth', 'mathematics',
       'marine biology ', 'environmental science',
       'communication disorders', 'engineering ', 'biochem',
       'kinesiology', 'economics and mathematics',
       'music education and german', 'art p-12',
       'chemistry either organic or inorganic', 'math',
       'electrical engineering',
       'undecided but leaning towards engineering', 'medicine ',
       'rec management ', 'economics ', 'geology', 'visual journalism ',
       'environmental studies ', 'biology/math',
       'behavioral neuroscience', 'electrical engineering ',
       'computer science', 'geology (paleoclimate)',
       'marine biology and theater production', 'anthropology',
       'biology ', 'management information systems ', 'marine bio',
       'history/holocaust & genocide studies', 'sped & eled',
       'visual journalism', 'anthropology, communication studies',
       'theatre', 'studio art',
       'urban planning and sustainable development',
       'urban planning and sustainable development ', 'history ',
       'art and design ', 'kinesiology ', 'spanish ', 'biochemistry ',
       'art studio', 'art ed', 'comm', 'early childhood education ',
       'creative writing', 'neuroscience ', 'marine science ',
       'marketing ', 'behavioral neuroscience ', 'pre nursing ',
       'engineering', 'graphic design', 'undecided',
       'english literature with a teaching emphasis',
       'political science ', 'international business ',
       'communication studies', 'dance',
       'narrative and folklore studies (fairhaven major) ', 'psychology ',
       'anthropology ', 'pre med and psychology ', 'biology',
       'education and public relations', 'economics/mathematics',
       'communications', 'art studio (ba), art history',
       'elementary education ', 'archaeology ', 'theatre/education',
       'marketing', 'business and sustainability ', 'biochemistry',
       'environmental studies: eco-social justice and education emphasis',
       'education ', 'education', 'envs ',
       'mathematics secondary education', 'music composition',
       'sociology ', 'stem', 'linguistics ', 'fairhaven', 'fairhaven ',
       'behavioural neuroscience', 'english lit',
       'food equity and sustainable agriculture ',
       'art history and museum studies', 'japanese language ',
       'graphic design and marketing ', 'music performance major',
       'environment studies', 'business or elementary education ',
       'marine and coastal science',
       'undeclared, strongly thinking about history ', 'public health',
       'energy policy and management ', 'undeclared', 'fine arts',
       'undecided ', 'english, history of culture ',
       'psychology and elementary education ',
       'communication science and disordwrs', 'anthropology/history',
       'special education and elementary education ', 'ibus',
       'energy science', 'politics/philosophy/economics', 'studio art ',
       'history/social studies', 'energy'], dtype=object)
In [16]:
manual_mapping_aos = {
    "computer science": "STEM",
    "data science": "STEM",
    "eece": "STEM",
    "electrical and computer engineering": "STEM",
    "statistics": "STEM",
    "environmental studies": "STEM",
    "applied mathematics": "STEM",
    "chemistry": "STEM",
    "rml": "Other",
    "political science": "Social Studies",
    "elementary education": "Education",
    "english": "Arts & Humanities",
    "music education": "Arts & Humanities",
    "nan": "Other",
    "art": "Arts & Humanities",
    "psychology": "Social Studies",
    "psych (probably)": "Social Studies",
    "music": "Arts & Humanities",
    "environmental science - toxicology": "STEM",
    "history/museum studies": "Arts & Humanities",
    "elementary ed": "Education",
    "environmental science": "STEM",
    "mathematics": "STEM",
    "business": "Business",
    "biochem": "STEM",
    "vocal performance": "Arts & Humanities",
    "secondary education": "Education",
    "linguistics": "Arts & Humanities",
    "history": "Arts & Humanities",
    "bio/anth": "STEM",
    "marine biology": "STEM",
    "communication disorders": "Health & Medicine",
    "engineering": "STEM",
    "kinesiology": "Health & Medicine",
    "economics and mathematics": "STEM",
    "music education and german": "Arts & Humanities",
    "art p-12": "Arts & Humanities",
    "chemistry either organic or inorganic": "STEM",
    "math": "STEM",
    "electrical engineering": "STEM",
    "undecided but leaning towards engineering": "Unknown",  # Assuming lean towards STEM, but no exact match
    "medicine": "Health & Medicine",
    "rec management": "Other",  # Assuming Recreation Management
    "economics": "Social Studies",  # Close to "economics and mathematics", but economics is often considered Social Studies
    "geology": "STEM",
    "geology (paleoclimate)": "STEM",
    "visual journalism": "Arts & Humanities",
    "biology/math": "STEM",  # Combination of two STEM fields
    "behavioral neuroscience": "STEM",  # Close to "psychology" which is Social Studies, but has a heavy STEM component
    "marine biology and theater production": "STEM",  # Marine biology is STEM, theater production could be Arts, but STEM is the primary
    "anthropology": "Social Studies",
    "biology": "STEM",
    "management information systems": "Business",
    "marine bio": "STEM",
    "history/holocaust & genocide studies": "Arts & Humanities",
    "sped & eled": "Education",  # Assuming this refers to special education & elementary education
    "anthropology, communication studies": "Social Studies",
    "theatre": "Arts & Humanities",
    "studio art": "Arts & Humanities",
    "urban planning and sustainable development": "Other",  # Not a clear category, could be Social Studies or another category
    "art and design": "Arts & Humanities",
    "spanish": "Arts & Humanities",  # Language studies are often classified here
    "biochemistry": "STEM",
    "art studio": "Arts & Humanities",
    "art ed": "Arts & Humanities",
    "comm": "Other",  # Assuming "communication", but not explicitly listed, could fit Social Studies or Business,
    
    "environmental studies: eco-social justice and education emphasis": "STEM",  # Falls under Environmental Studies
    "communications": "Other",  # Often classified as Arts & Humanities
    "theatre/education": "Education",  # Falls under Education
    "undecided": "Unknown",  # Assuming still undecided as before
    "marketing": "Business",  # Falls under Business
    "communication studies": "Arts & Humanities",  # Often classified as Arts & Humanities
    "sociology": "Social Studies",  # Social Studies
    "education and public relations": "Education",  # Falls under Education
    "pre nursing": "Health & Medicine",  # Falls under Health & Medicine
    "economics/mathematics": "STEM",  # Combination of Economics and Mathematics - falls under STEM
    "mathematics secondary education": "Education",  # Falls under Education
    "dance": "Arts & Humanities",  # Often classified as Arts & Humanities
    "art studio (ba), art history": "Arts & Humanities",  # Falls under Arts & Humanities
    "narrative and folklore studies (fairhaven major)": "Arts & Humanities",  # Falls under Arts & Humanities
    "pre med and psychology": "Health & Medicine",  # Falls under Health & Medicine
    "archaeology": "Social Studies",  # Falls under Social Studies
    "neuroscience": "STEM",  # Falls under STEM
    "english literature with a teaching emphasis": "Arts & Humanities",  # Falls under Arts & Humanities
    "marine science": "STEM",  # Falls under STEM
    "fairhaven": "Other",  # Falls under Other
    "international business": "Business",  # Falls under Business
    "music composition": "Arts & Humanities",  # Falls under Arts & Humanities
    "creative writing": "Arts & Humanities",  # Falls under Arts & Humanities
    "business and sustainability": "Business",  # Falls under Business
    "early childhood education": "Education",  # Falls under Education
    "graphic design": "Arts & Humanities",  # Falls under Arts & Humanities
    "education": "Education",  # Falls under Education
    "stem": "STEM",  # Falls under STEM
    "envs": "STEM",  # Falls under STEM

    
    "behavioural neuroscience": "STEM",
    "english lit": "Arts & Humanities",
    "food equity and sustainable agriculture": "Other",  # Could be Social Studies, Business, or even STEM, unclear
    "art history and museum studies": "Arts & Humanities",
    "japanese language": "Arts & Humanities",
    "graphic design and marketing": "Arts & Humanities",  # Graphic Design is often in Arts & Humanities, Marketing is Business, but first seems primary
    "music performance major": "Arts & Humanities",
    "environment studies": "STEM",
    "business or elementary education": "Unknown",  # Could be either Business or Education 
    "marine and coastal science": "STEM",
    "undeclared, strongly thinking about history": "Unknown",  # Assuming lean towards Arts & Humanities, but no exact match
    "public health": "Health & Medicine",
    "energy policy and management": "Other",  # Could be Business, Social Studies, or STEM
    "undeclared": "Unknown",
    "fine arts": "Arts & Humanities",
    "english, history of culture": "Arts & Humanities",
    "psychology and elementary education": "Education",  # Both Psychology and Elementary Education could be Education
    "communication science and disordwrs": "Health & Medicine",
    "anthropology/history": "Social Studies",
    "special education and elementary education": "Education",
    "ibus": "Business",  # Assuming International Business
    "energy science": "STEM",
    "politics/philosophy/economics": "Social Studies",  # Combination of three Social Studies fields
    "history/social studies": "Social Studies",
    "energy": "STEM"  # Energy could be a part of STEM disciplines like Physics or Environmental Sciences
}


noncategorized_data = []

def categorize_aos_string(aos):
    if pd.isnull(aos) or aos == "n/a" or aos == "" or aos == "N/A":
        return "Unknown"  # For handling NaN values
    field_clean = aos.lower().rstrip()
    for key, category in manual_mapping_aos.items():
        if key == field_clean:
            return category
    noncategorized_data.append(field_clean)    


aos_df["AOSCat"] = aos_df["AOS"].apply(categorize_aos_string)

if (len(noncategorized_data) > 0):
    raise ValueError(f"Unknown categories: {noncategorized_data}") 

aos_df["AOSCat"].unique()
Out[16]:
array(['STEM', 'Other', 'Social Studies', 'Education',
       'Arts & Humanities', 'Unknown', 'Business', 'Health & Medicine'],
      dtype=object)

Shorting the Length of Single Select Answer Choices for "Style"¶

In [17]:
style_df = aos_df 

style_df["Style"].unique()
Out[17]:
array(['Not interested in playing board/card games.',
       'Party-only player, primarily playing board/card games at social gatherings or parties.',
       'Strategy-focused player, enjoying games that require planning and tactics, but also participates in more casual games.',
       'Situation-Specific Player, adapts style and enthusiasm based on the specific game or social context. May be casual in some situations and highly strategic in others, depending on the game being played.',
       'None of these describe me.',
       'Casual player, participating for fun and relaxation, without a strong focus on winning. Still willing to engage with complex games.',
       'Simple and straightforward player, preferring uncomplicated games with easy rules.'],
      dtype=object)
In [18]:
player_styles_mapping = {
    "Strategy-focused player, enjoying games that require planning and tactics, but also participates in more casual games.": "Strategic",
    "Situation-Specific Player, adapts style and enthusiasm based on the specific game or social context. May be casual in some situations and highly strategic in others, depending on the game being played.": "Situation-Specific",
    "Casual player, participating for fun and relaxation, without a strong focus on winning. Still willing to engage with complex games.": "Casual",
    "Simple and straightforward player, preferring uncomplicated games with easy rules.": "Simple",
    "Party-only player, primarily playing board/card games at social gatherings or parties.": "Party/Social",
    "Not interested in playing board/card games.": "Not Interested",
    "Never played or never had the opportunity to play board/card games.": "Never Played",
    "None of these describe me.": "Other",
    "Prefer not to say": "Prefer not to say"
}

def map_player_styles(string):
    for key, val in player_styles_mapping.items():
        if key in string:
            return val
    raise ValueError(f"Unknown category: {string}") 

new_style_df = style_df.copy()

new_style_df["Style"] = style_df["Style"].apply(map_player_styles)

new_style_df["Style"].unique()
Out[18]:
array(['Not Interested', 'Party/Social', 'Strategic',
       'Situation-Specific', 'Other', 'Casual', 'Simple'], dtype=object)

Assigning Order to Particular Catagories¶

In [19]:
cat_order_df = new_style_df.copy()

cat_order_df["BoardGamesOwned"] = cat_order_df["BoardGamesOwned"].apply(lambda x: "0" if x == "I do not own any board/card games.\xa0" else x)

cat_order_df["BoardGamesOwned"] = pd.Categorical(cat_order_df["BoardGamesOwned"],
    categories=["Prefer not to say","0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],ordered=True)


cat_order_df["FrequencyOfPlay"] = pd.Categorical(cat_order_df["FrequencyOfPlay"],
    categories=["Daily","Several times a week","Weekly","Several times a month","Monthly","Every few months","Rarely/Seldom","Never","Prefer not to say",],ordered=True)


cat_order_df["ChessRating"] = cat_order_df["ChessRating"].apply(lambda x: "Unknown" if x == "" else x)

cat_order_df["ChessRating"] = pd.Categorical(cat_order_df["ChessRating"].astype(str),
    categories=["Unknown","1","2","3","4","5","6","7","8","9","10"], ordered=True)

Convert Features to Int¶

In [20]:
convert_df = cat_order_df.copy()
convert_df["Age"] = convert_df["Age"].apply(lambda x: -1 if x == "" else x)
convert_df["Age"] = convert_df["Age"].astype(int)
In [21]:
cleaned_df = convert_df

cleaned_df.to_csv("datasets/cleaned.csv")

Exploratory Data Analysis¶

In [22]:
single_select_features = ['WWUStatus', 'Gender', 'Age', 'EmploymentStatus', 'Vision',
       'Religiosity', 'AOSCat', 'EnjoysBoardGames', 'BoardGamesOwned',
       'FrequencyOfPlay', 'Style', 'ChessRating']

race_bool_features = ['WhiteIsRace','AsianIsRace', 'BlackorAfricanAmericanIsRace', 'FilipinoIsRace',
       'HispanicorLatinoIsRace','MixedethnicityIsRace', 'MultiracialIsRace', 'NativeAmericanorAmericanIndianIsRace',
       'NativeHawaiianorPacificIslanderIsRace', 'PrefernottosayIsRace',]
       
element_bool_features = ['Conflict/CompetitionIsPreferredElement',
       'CooperationIsPreferredElement',
       'Heavy/ImmersiveThemingIsPreferredElement', 'LuckIsPreferredElement',
       'Party/Low-StakesIsPreferredElement',
       'Puzzle-SolvingIsPreferredElement',
       'SocialDeduction/HiddenRoleIsPreferredElement',
       'StrategyIsPreferredElement', 'TriviaIsPreferredElement',]

genre_bool_features = ['AbstractStrategyIsEnjoyedGenre', 'AdventureIsEnjoyedGenre',
       'AnimalsIsEnjoyedGenre', 'AuctionIsEnjoyedGenre', 'CardIsEnjoyedGenre',
       'CardDraftingIsEnjoyedGenre', 'CivilizationIsEnjoyedGenre',
       'CooperativeIsEnjoyedGenre', 'Deck-BuildingIsEnjoyedGenre',
       'DeductionIsEnjoyedGenre', 'EconomicIsEnjoyedGenre',
       'EducationalIsEnjoyedGenre', 'ExplorationIsEnjoyedGenre',
       'FantasyIsEnjoyedGenre', 'FarmingIsEnjoyedGenre',
       'FightingIsEnjoyedGenre', 'HorrorIsEnjoyedGenre', 'LuckIsEnjoyedGenre',
       'MedievalIsEnjoyedGenre', 'MemoryIsEnjoyedGenre',
       'MiniaturesIsEnjoyedGenre', 'PartyIsEnjoyedGenre',
       'PiratesIsEnjoyedGenre', 'PoliticalIsEnjoyedGenre',
       'PuzzleIsEnjoyedGenre', 'RacingIsEnjoyedGenre',
       'Role-PlayingIsEnjoyedGenre', 'RollandMoveIsEnjoyedGenre',
       'ScienceFictionIsEnjoyedGenre',
       'SocialDeduction/HiddenRoleIsEnjoyedGenre', 'SportsIsEnjoyedGenre',
       'StrategyIsEnjoyedGenre', 'TerritoryBuildingIsEnjoyedGenre',
       'Tile-LayingIsEnjoyedGenre', 'TrainsIsEnjoyedGenre',
       'TransportationIsEnjoyedGenre', 'TravelIsEnjoyedGenre',
       'TriviaIsEnjoyedGenre', 'WarIsEnjoyedGenre', 'WordIsEnjoyedGenre',
       'WorkerPlacementIsEnjoyedGenre', 'WorldWarIIIsEnjoyedGenre',
       'ZombiesIsEnjoyedGenre',]

free_form_features = ['AOS','EnjoyedBoardGames']
In [23]:
len(cleaned_df.columns)
Out[23]:
76
In [24]:
len(single_select_features + race_bool_features + element_bool_features + genre_bool_features + free_form_features)
Out[24]:
76
In [25]:
cleaned_df["Religiosity"].value_counts()
Out[25]:
Religiosity
Atheism                                                        59
No specific belief                                             55
Agnosticism                                                    49
Christianity                                                   32
Spiritual, not affiliated with a specific religion             31
Judaism                                                         4
Prefer not to say                                               2
Pagan                                                           1
Islam                                                           1
Toaism                                                          1
Lutheran                                                        1
Hinduism                                                        1
Buddhism                                                        1
pagan                                                           1
Unitarian                                                       1
Science                                                         1
being with oneself in connection to everything                  1
Paganism                                                        1
I believe a god exists but don’t follow any religious texts     1
Name: count, dtype: int64
In [26]:
religious_bucketing = {
    "Christianity": "Religious",
    "Judaism": "Religious",
    "Islam": "Religious",
    "Hinduism": "Religious",
    "Buddhism": "Religious",
    "Lutheran": "Religious",
    "Pagan": "Religious",
    "Paganism": "Religious",
    "Unitarian": "Religious",
    "I believe a god exists but don’t follow any religious texts": "Religious",
    "Toaism": "Religious",
    "pagan": "Religious",

    "Atheism": "Not Religious",
    "No specific belief": "Not Religious",
    "Agnosticism": "Not Religious",
    "Spiritual, not affiliated with a specific religion": "Not Religious",
    "Science": "Not Religious",
    "being with oneself in connection to everything": "Not Religious",
    "Prefer not to say": "Not Religious",
}

Defining Functions for Analyzing Categorical Data¶

In [27]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import pandas as pd



def categories_against_category(df, categories, category):
    for feature in categories:
        print(df.groupby(category)[feature].value_counts())
        # Compute percentage for each category within each cluster
        df_percent = df.groupby(category)[feature].value_counts(normalize=True).rename('Percentage').reset_index()
        df_percent["Percentage"] *= 100  # Convert to %

        # Create seaborn barplot
        plt.figure(figsize=(8, 6))
        sns.barplot(x=feature, y='Percentage', hue=category, data=df_percent)
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
        plt.xticks(rotation=90)
        plt.show()

def bools_against_category(df,bools,category):
    # Initialize an empty dataframe to store your percentages
    percentages = pd.DataFrame()

    # Loop through the bool_columns 
    for col in bools:
        # Compute the percentages of True (=1) occurrences in each cluster
        percents = df.groupby(category,observed=True)[col].mean().mul(100).reset_index()
        percents['Feature'] = col
        percents.rename({col: 'Percentage'}, axis=1, inplace=True)

        # Append the computed percentages to your dataframe
        percentages = pd.concat([percentages, percents])

    # Once your dataframe is ready, you can plot it using seaborn
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Feature', y='Percentage', hue=category, data=percentages)
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
    plt.xticks(rotation=90)
    plt.title(f"Percentage of 'True' Occurrences in Each Feature by {category}")
    plt.show()

# Intended for single-select/categorical feature comparison
# `x` should be a Panda Series
# `y` should be a Panda Series
def plot_count_and_percentage_heatmaps(x, y):
    data = pd.DataFrame({x.name: x, y.name: y})

    count_matrix = pd.crosstab(data[x.name], data[y.name])
    
    percent_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0) * 100

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Heatmap for counts
    sns.heatmap(count_matrix, annot=True, fmt="d", cmap="YlGnBu", cbar=True, ax=ax[0])
    ax[0].set_title("Count Matrix of Categories")
    ax[0].set_xlabel(y.name)
    ax[0].set_ylabel(x.name)

    # Heatmap for percentages
    sns.heatmap(percent_matrix, annot=True, fmt=".2f", cmap="YlGnBu", cbar=True, ax=ax[1],vmin=0.0, vmax=100.0)
    ax[1].set_title("Percentage Matrix of Categories")
    ax[1].set_xlabel(y.name)
    ax[1].set_ylabel(x.name)

    plt.tight_layout()
    plt.show()

# Intended for categorical feature comparison against several bool fields
# `data` should be a Pandas Data Frame
# `x` should be a string that refers to the categorical feature in `data`
# `y` should be an array of strings that refer to the several bool features in `data`
def plot_heatmap_of_bool_features_percent(data, x, y):
    df_melted = data.melt(id_vars=x, value_vars=y, var_name="Bool", value_name="True")

    df_pivot = df_melted.groupby([x,"Bool"]).sum().reset_index().pivot(index=x, columns="Bool", values="True")

    total = data[x].value_counts()

    df_pivot = df_pivot.join(total)
    

    df_pivot.loc[:, df_pivot.columns != "count"] = ((df_pivot.loc[:, df_pivot.columns != "count"].div(df_pivot["count"], axis=0)) * 100).round(4)
    
    print(df_pivot["count"])
    df_pivot = df_pivot.drop(columns=["count"])
    
    df_pivot = df_pivot.transpose()

    sns.heatmap(df_pivot, annot=True, cmap="Blues", fmt=".2f",vmin=0.0, vmax=100.0)

# Intended for categorical feature comparison against several bool fields
# `data` should be a Pandas Data Frame
# `x` should be a string that refers to the categorical feature in `data`
# `y` should be an array of strings that refer to the several bool features in `data`
def plot_heatmap_of_bool_features_val_count(data, x, y):
    df_melted = data.melt(id_vars=x, value_vars=y, var_name="Bool", value_name="True")

    df_pivot = df_melted.groupby([x,"Bool"]).sum().reset_index().pivot(index=x, columns="Bool", values="True")

    total = data[x].value_counts()

    df_pivot = df_pivot.join(total)
    
    df_pivot.rename(columns={"count": "UniqueMembersOfParticular" + x}, inplace=True)
    
    df_pivot = df_pivot.transpose()
    # Plot
    sns.heatmap(df_pivot, annot=True, cmap="Blues", fmt=".2f")

# Used to reduce the number of categories in a feature by setting the value to "Other" for insufficient response quantity of specific catagories
# `df` should be a Pandas Data Frame
# `col_name` should be the string name of the column/feature you'd like to filter
# `threshold` should be the minimum count needed to maintain the categorical variable 

# For example, if `Age` gets a bunch of responses between 18 and 24, and you get a few 40s and one 60. You can use this function to reduce the 
# different categorical variables to simply 18 through 24 and "Other"
def filter_threshold(df, col_name, threshold):
    df_copy = df.copy() 
    counts = df_copy[col_name].value_counts()
    df_copy.loc[df_copy[col_name].isin(counts[counts < threshold].index), col_name] = "Other"
    return df_copy

Distribution of Features¶

The following code is used to filter out feature responses that are underrepresented or minimal in survey data, thus preventing inaccurate conclusions that could arise from only looking at a small sample of a population. Most of features graphed below have at least two prominent catagories for which to perform data analysis, which is sufficient. This includes:

  • Gender
  • Age (within the range of 18 to 24)
  • Employment Status (at least for Part-Time and Unemployed)
  • Vision
  • Area of Study Catagories
  • Number of Board Games Owned
  • Frequency of Play
  • Style
  • Chess Rating

However, a few fields did not get sufficient diverse responses to conclude anything, this includes:

  • WWU Status
    • Nearly everyone who took this survey was a student
  • Religiosity
    • Most people who filled out this survey were secular, and the largest minority religious group is simply too small to make any meaningful conclusions
  • Does person enjoys board games
    • The overwhelming majority of people who filled out this survey liked playing board games making relationships between demographics hard to draw
In [28]:
for feature in single_select_features:
    sns.countplot(x=feature, data=cleaned_df)
    plt.xticks(rotation=90) 
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Distribution of Features (cont'd)¶

The following questions were multi-select and resulted in bool fields. Because of this, we need to display the data slightly differently: totalling all "True"/selected values from the related features.

The multi-select questions were in regard to Race, Preferred Gameplay Elements, and Enjoyed Genres.

Preferred Gameplay Elements and Enjoyed Genres got a sufficient distribution of responses, however, Race did not receive sufficiently diverse responses for analysis.

In [29]:
true_counts = {}

for col in race_bool_features:
    true_counts[col] = cleaned_df[cleaned_df[col] == 1].shape[0]

true_counts_series = pd.Series(true_counts)

sns.barplot(true_counts_series)
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [30]:
true_counts = {}

for col in element_bool_features:
    true_counts[col] = cleaned_df[cleaned_df[col] == 1].shape[0]

true_counts_series = pd.Series(true_counts)

sns.barplot(true_counts_series)
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [31]:
true_counts = {}

for col in genre_bool_features:
    true_counts[col] = cleaned_df[cleaned_df[col] == 1].shape[0]

true_counts_series = pd.Series(true_counts)

sns.barplot(true_counts_series)
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image

(Random) Exploratory Analysis¶

This section is dedicated to graphing random sufficient features against one another in hopes of uncovering hidden relationships.

In [32]:
# There were many different responses for various types of non-binary gender identities, because of this, graphing the data becomes a bit muddled
# This is solved by grouping the non-binary responses into an "Other" category
genderForAnalysis_df = filter_threshold(cleaned_df,"Gender",25)
genderForAnalysis_df["Gender"].value_counts()
Out[32]:
Gender
Woman    109
Man      101
Other     34
Name: count, dtype: int64
In [33]:
genderForAnalysis_df = genderForAnalysis_df[genderForAnalysis_df["Gender"] != "Other"]
In [34]:
categories_against_category(genderForAnalysis_df,single_select_features,"Gender")
Gender  WWUStatus         
Man     WWU Student            98
        WWU Faculty Member      2
        Neither                 1
Woman   WWU Student           108
        WWU Faculty Member      1
Name: count, dtype: int64
No description has been provided for this image
Gender
Man      101
Woman    109
Name: count, dtype: int64
No description has been provided for this image
Gender  Age
Man      18    28
         19    23
         20    16
         21    11
         22     6
         23     6
        -1      5
         26     3
         24     2
         25     1
Woman    18    29
         19    23
         20    21
         21    15
         22     8
         23     4
         24     4
        -1      1
         25     1
         28     1
         29     1
         36     1
Name: count, dtype: int64
No description has been provided for this image
Gender  EmploymentStatus                 
Man     Unemployed                           51
        Employed part-time                   45
        Employed full-time                    2
        Seasonal worker during the summer     1
        Seeking employment                    1
        Self-employed                         1
Woman   Unemployed                           53
        Employed part-time                   50
        Prefer not to say                     2
        Employed full-time                    1
        One day a week babysitting            1
        Seasonally employed                   1
        Self-employed                         1
Name: count, dtype: int64
No description has been provided for this image
Gender  Vision           
Man     None                 61
        Glasses              27
        Both                  7
        Contacts              5
        Prefer not to say     1
Woman   None                 55
        Glasses              31
        Both                 18
        Contacts              5
Name: count, dtype: int64
No description has been provided for this image
Gender  Religiosity                                                
Man     Agnosticism                                                    26
        Atheism                                                        25
        No specific belief                                             24
        Christianity                                                   12
        Spiritual, not affiliated with a specific religion              6
        Judaism                                                         3
        Toaism                                                          1
        Science                                                         1
        Prefer not to say                                               1
        Islam                                                           1
        Buddhism                                                        1
Woman   No specific belief                                             26
        Atheism                                                        22
        Christianity                                                   20
        Agnosticism                                                    19
        Spiritual, not affiliated with a specific religion             17
        I believe a god exists but don’t follow any religious texts     1
        Lutheran                                                        1
        Judaism                                                         1
        Hinduism                                                        1
        Unitarian                                                       1
Name: count, dtype: int64
No description has been provided for this image
Gender  AOSCat           
Man     STEM                 48
        Arts & Humanities    16
        Unknown              14
        Social Studies        7
        Business              6
        Education             5
        Other                 3
        Health & Medicine     2
Woman   STEM                 35
        Arts & Humanities    21
        Unknown              16
        Education            10
        Health & Medicine     8
        Social Studies        8
        Other                 7
        Business              4
Name: count, dtype: int64
No description has been provided for this image
Gender  EnjoysBoardGames 
Man     Yes                   93
        No                     8
Woman   Yes                  106
        No                     2
        Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Gender  BoardGamesOwned  
Man     2 to 5               29
        1 or 2               24
        5 to 10              24
        0                    10
        10 to 20              7
        More than 20          7
        Prefer not to say     0
Woman   5 to 10              34
        2 to 5               31
        1 or 2               27
        10 to 20              9
        0                     4
        More than 20          4
        Prefer not to say     0
Name: count, dtype: int64
No description has been provided for this image
Gender  FrequencyOfPlay      
Man     Several times a month    24
        Every few months         18
        Rarely/Seldom            17
        Weekly                   14
        Monthly                  13
        Several times a week     10
        Never                     3
        Daily                     2
        Prefer not to say         0
Woman   Every few months         28
        Several times a month    27
        Monthly                  21
        Weekly                   12
        Rarely/Seldom             9
        Several times a week      7
        Daily                     4
        Never                     1
        Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Gender  Style             
Man     Situation-Specific    38
        Casual                31
        Strategic             20
        Party/Social           6
        Not Interested         3
        Simple                 2
        Other                  1
Woman   Situation-Specific    43
        Casual                41
        Strategic             15
        Party/Social           6
        Simple                 4
Name: count, dtype: int64
No description has been provided for this image
Gender  ChessRating
Man     7              25
        8              17
        6              12
        4              11
        3               8
        5               8
        1               8
        9               4
        10              4
        2               4
        Unknown         0
Woman   1              28
        5              14
        2              14
        7              12
        4              10
        6               9
        8               8
        3               7
        Unknown         3
        10              3
        9               1
Name: count, dtype: int64
No description has been provided for this image
In [35]:
bools_against_category(genderForAnalysis_df,race_bool_features,"Gender")
bools_against_category(genderForAnalysis_df,element_bool_features,"Gender")
bools_against_category(genderForAnalysis_df,genre_bool_features,"Gender")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

All Single Select Against ChessRating Box Plot

There does appear to be a substantial difference in ratings between Men and Women within in the data.

In [36]:
genderForAnalysis_df["Gender"].value_counts()
Out[36]:
Gender
Woman    109
Man      101
Name: count, dtype: int64
In [37]:
sns.histplot(x=cleaned_df["ChessRating"])
Out[37]:
<Axes: xlabel='ChessRating', ylabel='Count'>
No description has been provided for this image
In [38]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"Gender",30)
modified_df = filter_threshold(modified_df,"Age",10)
modified_df = filter_threshold(modified_df,"EmploymentStatus",30)
modified_df["Religiosity"] = modified_df["Religiosity"].map(religious_bucketing)
modified_df = filter_threshold(modified_df,"AOSCat",30)
modified_df["FrequencyOfPlay"] = modified_df["FrequencyOfPlay"].apply(lambda x: "At Least Weekly" if x in ["Daily","Several times a week","Weekly"] else x)
modified_df["FrequencyOfPlay"] = pd.Categorical(modified_df["FrequencyOfPlay"],
    categories=["At Least Weekly","Several times a month","Monthly","Every few months","Rarely/Seldom","Never","Prefer not to say"],ordered=True)


features = single_select_features.copy()
features.remove("WWUStatus")
for feature in features:
    print(modified_df[feature].value_counts())
    plt.figure(figsize=(20, 10))
    sns.boxenplot(x=modified_df[feature],y=modified_df["ChessRating"])
    plt.ylim(reversed(plt.ylim()))
    plt.xticks(rotation=90)
    plt.show()
Gender
Woman    109
Man      101
Other     34
Name: count, dtype: int64
No description has been provided for this image
Age
18       69
19       58
20       41
21       28
Other    23
22       15
23       10
Name: count, dtype: int64
No description has been provided for this image
EmploymentStatus
Unemployed            124
Employed part-time    107
Other                  13
Name: count, dtype: int64
No description has been provided for this image
Vision
None                 132
Glasses               76
Both                  25
Contacts              10
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Religiosity
Not Religious    197
Religious         46
Name: count, dtype: int64
No description has been provided for this image
AOSCat
STEM                 95
Other                65
Arts & Humanities    47
Unknown              37
Name: count, dtype: int64
No description has been provided for this image
EnjoysBoardGames
Yes                  231
No                    12
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
BoardGamesOwned
2 to 5               73
5 to 10              60
1 or 2               59
10 to 20             20
0                    19
More than 20         13
Prefer not to say     0
Name: count, dtype: int64
No description has been provided for this image
FrequencyOfPlay
Several times a month    62
At Least Weekly          56
Every few months         54
Monthly                  37
Rarely/Seldom            30
Never                     5
Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Style
Situation-Specific    91
Casual                79
Strategic             42
Party/Social          19
Simple                 8
Not Interested         4
Other                  1
Name: count, dtype: int64
No description has been provided for this image
ChessRating
1          43
7          43
4          26
8          26
5          25
2          23
6          23
3          18
10          9
9           5
Unknown     3
Name: count, dtype: int64
No description has been provided for this image
In [39]:
# Concatenate the boolean columns with the "ChessRating" column
df_bool = cleaned_df[element_bool_features + ['ChessRating']]

# Reshape your DataFrame so that each boolean feature and its corresponding "ChessRating" are in a single row
df_melt = df_bool.melt(id_vars='ChessRating', var_name='Feature', value_name='Value')

# Select only the rows where the category is marked as True
df_melt = df_melt[df_melt['Value'] == 1]

# Now you can plot everything on the same plot
plt.figure(figsize=(20, 10))
sns.boxenplot(x='Feature', y='ChessRating', data=df_melt)
plt.ylim(reversed(plt.ylim()))
plt.xticks(rotation=90)
plt.title('Distribution of ChessRating for Each Feature')
plt.show()
No description has been provided for this image
In [40]:
# Concatenate the boolean columns with the "ChessRating" column
df_bool = cleaned_df[genre_bool_features + ['ChessRating']]

# Reshape your DataFrame so that each boolean feature and its corresponding "ChessRating" are in a single row
df_melt = df_bool.melt(id_vars='ChessRating', var_name='Feature', value_name='Value')

# Select only the rows where the category is marked as True
df_melt = df_melt[df_melt['Value'] == 1]

# Now you can plot everything on the same plot
plt.figure(figsize=(20, 10))
sns.boxenplot(x='Feature', y='ChessRating', data=df_melt)
plt.ylim(reversed(plt.ylim()))
plt.xticks(rotation=90)
plt.title('Distribution of ChessRating for Each Feature')
plt.show()
No description has been provided for this image
In [41]:
# This produces a bunch of plots, but it's hard to interpret as it is currently, and their is not an amazing amount of discovery

# modified_df = cleaned_df.copy()
# modified_df = filter_threshold(modified_df,"Gender",30)
# modified_df = filter_threshold(modified_df,"Age",10)
# modified_df = filter_threshold(modified_df,"EmploymentStatus",30)
# modified_df["Religiosity"] = modified_df["Religiosity"].map(religious_bucketing)
# modified_df = filter_threshold(modified_df,"AOSCat",30)

# features = single_select_features.copy()
# features.remove("WWUStatus")
# for feature in features:
#     print(modified_df[feature].value_counts())
#     bools_against_category(modified_df,element_bool_features,feature)
In [42]:
# This produces a bunch of plots, but it's hard to interpret as it is currently, and their is not an amazing amount of discovery


# modified_df = cleaned_df.copy()
# modified_df = filter_threshold(modified_df,"Gender",30)
# modified_df = filter_threshold(modified_df,"Age",10)
# modified_df = filter_threshold(modified_df,"EmploymentStatus",30)
# modified_df["Religiosity"] = modified_df["Religiosity"].map(religious_bucketing)
# modified_df = filter_threshold(modified_df,"AOSCat",30)

# features = single_select_features.copy()
# features.remove("WWUStatus")
# for feature in features:
#     print(modified_df[feature].value_counts())
#     bools_against_category(modified_df,genre_bool_features,feature)
In [43]:
cleaned_df["BoardGamesOwned"].value_counts()
Out[43]:
BoardGamesOwned
2 to 5               73
5 to 10              60
1 or 2               59
10 to 20             20
0                    19
More than 20         13
Prefer not to say     0
Name: count, dtype: int64
In [44]:
selected_rows = cleaned_df.copy()

bools_against_category(selected_rows,element_bool_features,"BoardGamesOwned")
bools_against_category(selected_rows,genre_bool_features,"BoardGamesOwned")
No description has been provided for this image
No description has been provided for this image
In [45]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"Style",20)
modified_df = modified_df[modified_df["Style"] != "Other"]


print(modified_df["Style"].value_counts())
bools_against_category(modified_df,element_bool_features,"Style")
bools_against_category(modified_df,genre_bool_features,"Style")
Style
Situation-Specific    91
Casual                79
Strategic             42
Name: count, dtype: int64
No description has been provided for this image
No description has been provided for this image
In [46]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"AOSCat",20)
print(modified_df["AOSCat"].value_counts())

features = single_select_features.copy()
features.remove("WWUStatus")
bools_against_category(modified_df,element_bool_features,"AOSCat")
bools_against_category(modified_df,genre_bool_features,"AOSCat")
AOSCat
STEM                 95
Other                65
Arts & Humanities    47
Unknown              37
Name: count, dtype: int64
No description has been provided for this image
No description has been provided for this image

Machine Learning Clustering for Exploratory Analysis¶

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.calibration import LabelEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier


from sklearn import metrics
from sklearn.cluster import KMeans

data = cleaned_df.copy()
X = data.drop(columns=["EnjoyedBoardGames","AOS"])

onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), ["WWUStatus","EmploymentStatus","Vision","Religiosity","EnjoysBoardGames","Gender","Style","AOSCat"]),
        ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
                                               ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
                                               ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
                                               ["BoardGamesOwned",
                                                "FrequencyOfPlay",
                                                "ChessRating"]),
        ("scale numeric types", StandardScaler(), ["Age"])                      
    ], remainder="passthrough"
    )


X_transformed = onehot_and_ordinal_transform.fit_transform(X)



for i in range(2,10):
    k_means_model = KMeans(n_clusters=i, random_state=5, n_init=10)
    clusters = k_means_model.fit_predict(X_transformed)
    score = metrics.silhouette_score(X_transformed, clusters)
    print(f'Fitting for {i} clusters')
    print(f'score: {score}')
    print()
Fitting for 2 clusters
score: 0.11863466543320195

Fitting for 3 clusters
score: 0.05927103971485946

Fitting for 4 clusters
score: 0.053179399224910384

Fitting for 5 clusters
score: 0.050326176098971546

Fitting for 6 clusters
score: 0.053239186913892206

Fitting for 7 clusters
score: 0.0531029232311401

Fitting for 8 clusters
score: 0.049338481370941906

Fitting for 9 clusters
score: 0.0474473146034939

In [48]:
k_means_model = KMeans(n_clusters=2, random_state=5, n_init=10)
clusters = k_means_model.fit_predict(X_transformed)
score = metrics.silhouette_score(X_transformed, clusters)
print(f'Fitting for {2} clusters')
print(f'score: {score}')
print()
Fitting for 2 clusters
score: 0.11863466543320195

In [49]:
X["Cluster"] = k_means_model.fit_predict(X_transformed)
X
Out[49]:
WWUStatus Gender Age EmploymentStatus Vision Religiosity EnjoysBoardGames BoardGamesOwned FrequencyOfPlay Style ... TransportationIsEnjoyedGenre TravelIsEnjoyedGenre TriviaIsEnjoyedGenre WarIsEnjoyedGenre WordIsEnjoyedGenre WorkerPlacementIsEnjoyedGenre WorldWarIIIsEnjoyedGenre ZombiesIsEnjoyedGenre AOSCat Cluster
0 WWU Student Man 22 Employed part-time None No specific belief No 0 Never Not Interested ... 0 0 0 0 0 0 0 0 STEM 0
1 WWU Student Man 20 Employed part-time Contacts Atheism Yes 2 to 5 Rarely/Seldom Party/Social ... 0 0 0 0 0 0 0 0 STEM 0
2 WWU Student Non-binary 18 Unemployed None Atheism Yes 10 to 20 Several times a week Strategic ... 0 0 0 0 0 0 0 0 STEM 0
3 WWU Student Man 21 Employed part-time None Atheism Yes 10 to 20 Several times a month Strategic ... 0 0 0 1 0 0 1 0 STEM 1
4 WWU Student Man 22 Employed part-time None Christianity Yes More than 20 Weekly Situation-Specific ... 0 0 1 1 1 1 1 1 STEM 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
239 WWU Student Man 19 Employed part-time None No specific belief No 2 to 5 Rarely/Seldom Situation-Specific ... 0 0 0 0 0 0 0 0 STEM 0
240 WWU Student Non-binary 19 Unemployed Glasses Agnosticism Yes 1 or 2 Rarely/Seldom Casual ... 0 0 0 0 0 0 0 0 Social Studies 0
241 WWU Student Non-binary 19 Employed part-time None Agnosticism Yes 2 to 5 Several times a month Situation-Specific ... 0 1 1 0 1 0 0 0 STEM 1
242 WWU Student Man 18 Unemployed Glasses Agnosticism Yes 5 to 10 Several times a week Casual ... 1 0 1 0 1 1 0 0 Arts & Humanities 1
243 WWU Student Man 19 Unemployed Glasses Agnosticism Yes 2 to 5 Rarely/Seldom Party/Social ... 0 0 0 0 0 0 0 0 STEM 0

244 rows × 75 columns

In [50]:
X["Cluster"].value_counts()
Out[50]:
Cluster
0    187
1     57
Name: count, dtype: int64
In [51]:
X["Cluster"].value_counts(normalize=True)
Out[51]:
Cluster
0    0.766393
1    0.233607
Name: proportion, dtype: float64
In [52]:
categories_against_category(X,single_select_features,"Cluster")
Cluster  WWUStatus         
0        WWU Student           184
         WWU Faculty Member      3
1        WWU Student            56
         Neither                 1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Gender                   
0        Woman                        82
         Man                          78
         Non-binary                   17
         Genderfluid                   3
         Prefer not to say             2
         Gender-fluid                  1
         Genderqueer                   1
         Wouldn't you like to know     1
         girl thing                    1
         unsure                        1
1        Woman                        27
         Man                          23
         Non-binary                    6
         Gender queer                  1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Age
0         18    57
          19    38
          20    30
          21    24
          22    11
          23     9
         -1      7
          24     4
          26     3
          25     2
          36     1
          28     1
1         19    20
          18    12
          20    11
          21     4
          22     4
          24     2
          26     1
          23     1
         -1      1
          29     1
Name: count, dtype: int64
No description has been provided for this image
Cluster  EmploymentStatus                 
0        Unemployed                           92
         Employed part-time                   85
         Employed full-time                    3
         Self-employed                         3
         One day a week babysitting            1
         Prefer not to say                     1
         Seasonal worker during the summer     1
         Seeking employment                    1
1        Unemployed                           32
         Employed part-time                   22
         Employed full-time                    1
         Prefer not to say                     1
         Seasonally employed                   1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Vision           
0        None                 108
         Glasses               53
         Both                  16
         Contacts              10
1        None                  24
         Glasses               23
         Both                   9
         Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Religiosity                                                
0        Atheism                                                        46
         No specific belief                                             40
         Agnosticism                                                    37
         Spiritual, not affiliated with a specific religion             27
         Christianity                                                   22
         Judaism                                                         3
         Prefer not to say                                               2
         pagan                                                           1
         Science                                                         1
         being with oneself in connection to everything                  1
         Toaism                                                          1
         Paganism                                                        1
         Pagan                                                           1
         Lutheran                                                        1
         Islam                                                           1
         I believe a god exists but don’t follow any religious texts     1
         Hinduism                                                        1
1        No specific belief                                             15
         Atheism                                                        13
         Agnosticism                                                    12
         Christianity                                                   10
         Spiritual, not affiliated with a specific religion              4
         Judaism                                                         1
         Buddhism                                                        1
         Unitarian                                                       1
Name: count, dtype: int64
No description has been provided for this image
Cluster  AOSCat           
0        STEM                 74
         Arts & Humanities    33
         Unknown              26
         Education            16
         Social Studies       14
         Business              8
         Health & Medicine     8
         Other                 8
1        STEM                 21
         Arts & Humanities    14
         Unknown              11
         Social Studies        5
         Business              2
         Health & Medicine     2
         Other                 2
Name: count, dtype: int64
No description has been provided for this image
Cluster  EnjoysBoardGames 
0        Yes                  175
         No                    11
         Prefer not to say      1
1        Yes                   56
         No                     1
Name: count, dtype: int64
No description has been provided for this image
Cluster  BoardGamesOwned  
0        2 to 5               62
         1 or 2               54
         5 to 10              40
         0                    19
         10 to 20              8
         More than 20          4
         Prefer not to say     0
1        5 to 10              20
         10 to 20             12
         2 to 5               11
         More than 20          9
         1 or 2                5
         Prefer not to say     0
         0                     0
Name: count, dtype: int64
No description has been provided for this image
Cluster  FrequencyOfPlay      
0        Every few months         44
         Several times a month    43
         Monthly                  29
         Rarely/Seldom            28
         Weekly                   23
         Several times a week     10
         Daily                     5
         Never                     5
         Prefer not to say         0
1        Several times a month    19
         Every few months         10
         Several times a week      9
         Monthly                   8
         Weekly                    7
         Rarely/Seldom             2
         Daily                     2
         Never                     0
         Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Cluster  Style             
0        Casual                68
         Situation-Specific    54
         Strategic             33
         Party/Social          19
         Simple                 8
         Not Interested         4
         Other                  1
1        Situation-Specific    37
         Casual                11
         Strategic              9
Name: count, dtype: int64
No description has been provided for this image
Cluster  ChessRating
0        1              37
         7              36
         2              21
         4              18
         8              18
         5              16
         6              16
         3              15
         10              5
         9               4
         Unknown         1
1        5               9
         4               8
         8               8
         6               7
         7               7
         1               6
         10              4
         3               3
         Unknown         2
         2               2
         9               1
Name: count, dtype: int64
No description has been provided for this image
In [53]:
bools_against_category(X,race_bool_features,"Cluster")
bools_against_category(X,element_bool_features,"Cluster")
bools_against_category(X,genre_bool_features,"Cluster")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [54]:
from sklearn.cluster import SpectralClustering

data = cleaned_df.copy()
X = data.drop(columns=["EnjoyedBoardGames","AOS","Age"])

onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), ["WWUStatus","EmploymentStatus","Vision","Religiosity","EnjoysBoardGames","Gender","Style","AOSCat"]),
        ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
                                               ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
                                               ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
                                               ["BoardGamesOwned",
                                                "FrequencyOfPlay",
                                                "ChessRating"])                
    ], remainder="passthrough"
    )


X_transformed = onehot_and_ordinal_transform.fit_transform(X)



for i in range(2,10):
    spectral_clustering_model = SpectralClustering(n_clusters=i, random_state=5, n_init=10)
    clusters = spectral_clustering_model.fit_predict(X_transformed)
    score = metrics.silhouette_score(X_transformed, clusters)
    print(f'Fitting for {i} clusters')
    print(f'score: {score}')
    print()
Fitting for 2 clusters
score: 0.14680586026949505

Fitting for 3 clusters
score: 0.10914232473605325

Fitting for 4 clusters
score: 0.10934645867499984

Fitting for 5 clusters
score: 0.02475783992583868

Fitting for 6 clusters
score: 0.006370108937589488

Fitting for 7 clusters
score: 0.00015076200052050723

Fitting for 8 clusters
score: 0.048111530605883294

Fitting for 9 clusters
score: 0.02940213489175527

In [55]:
spectral_clustering_model = SpectralClustering(n_clusters=2, random_state=5, n_init=10)
clusters = spectral_clustering_model.fit_predict(X_transformed)
score = metrics.silhouette_score(X_transformed, clusters)
print(f'Fitting for {2} clusters')
print(f'score: {score}')
print()
Fitting for 2 clusters
score: 0.14680586026949505

In [56]:
X["Cluster"] = k_means_model.fit_predict(X_transformed)
X
Out[56]:
WWUStatus Gender EmploymentStatus Vision Religiosity EnjoysBoardGames BoardGamesOwned FrequencyOfPlay Style ChessRating ... TransportationIsEnjoyedGenre TravelIsEnjoyedGenre TriviaIsEnjoyedGenre WarIsEnjoyedGenre WordIsEnjoyedGenre WorkerPlacementIsEnjoyedGenre WorldWarIIIsEnjoyedGenre ZombiesIsEnjoyedGenre AOSCat Cluster
0 WWU Student Man Employed part-time None No specific belief No 0 Never Not Interested 3 ... 0 0 0 0 0 0 0 0 STEM 0
1 WWU Student Man Employed part-time Contacts Atheism Yes 2 to 5 Rarely/Seldom Party/Social 6 ... 0 0 0 0 0 0 0 0 STEM 0
2 WWU Student Non-binary Unemployed None Atheism Yes 10 to 20 Several times a week Strategic 8 ... 0 0 0 0 0 0 0 0 STEM 0
3 WWU Student Man Employed part-time None Atheism Yes 10 to 20 Several times a month Strategic 9 ... 0 0 0 1 0 0 1 0 STEM 1
4 WWU Student Man Employed part-time None Christianity Yes More than 20 Weekly Situation-Specific 7 ... 0 0 1 1 1 1 1 1 STEM 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
239 WWU Student Man Employed part-time None No specific belief No 2 to 5 Rarely/Seldom Situation-Specific 6 ... 0 0 0 0 0 0 0 0 STEM 0
240 WWU Student Non-binary Unemployed Glasses Agnosticism Yes 1 or 2 Rarely/Seldom Casual 2 ... 0 0 0 0 0 0 0 0 Social Studies 0
241 WWU Student Non-binary Employed part-time None Agnosticism Yes 2 to 5 Several times a month Situation-Specific 5 ... 0 1 1 0 1 0 0 0 STEM 1
242 WWU Student Man Unemployed Glasses Agnosticism Yes 5 to 10 Several times a week Casual 8 ... 1 0 1 0 1 1 0 0 Arts & Humanities 1
243 WWU Student Man Unemployed Glasses Agnosticism Yes 2 to 5 Rarely/Seldom Party/Social 7 ... 0 0 0 0 0 0 0 0 STEM 0

244 rows × 74 columns

In [57]:
X["Cluster"].value_counts()
Out[57]:
Cluster
0    183
1     61
Name: count, dtype: int64
In [58]:
X["Cluster"].value_counts(normalize=True)
Out[58]:
Cluster
0    0.75
1    0.25
Name: proportion, dtype: float64
In [59]:
features = single_select_features.copy()
features.remove("Age")
categories_against_category(X,features,"Cluster")
Cluster  WWUStatus         
0        WWU Student           180
         WWU Faculty Member      3
1        WWU Student            60
         Neither                 1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Gender                   
0        Woman                        83
         Man                          74
         Non-binary                   16
         Genderfluid                   3
         Prefer not to say             2
         Gender-fluid                  1
         Genderqueer                   1
         Wouldn't you like to know     1
         girl thing                    1
         unsure                        1
1        Man                          27
         Woman                        26
         Non-binary                    7
         Gender queer                  1
Name: count, dtype: int64
No description has been provided for this image
Cluster  EmploymentStatus                 
0        Unemployed                           89
         Employed part-time                   85
         Self-employed                         3
         Employed full-time                    2
         One day a week babysitting            1
         Prefer not to say                     1
         Seasonal worker during the summer     1
         Seeking employment                    1
1        Unemployed                           35
         Employed part-time                   22
         Employed full-time                    2
         Prefer not to say                     1
         Seasonally employed                   1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Vision           
0        None                 107
         Glasses               51
         Both                  15
         Contacts              10
1        Glasses               25
         None                  25
         Both                  10
         Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Religiosity                                                
0        Atheism                                                        44
         No specific belief                                             40
         Agnosticism                                                    36
         Spiritual, not affiliated with a specific religion             26
         Christianity                                                   22
         Judaism                                                         3
         Prefer not to say                                               2
         pagan                                                           1
         Science                                                         1
         being with oneself in connection to everything                  1
         Toaism                                                          1
         Paganism                                                        1
         Pagan                                                           1
         Lutheran                                                        1
         Islam                                                           1
         I believe a god exists but don’t follow any religious texts     1
         Hinduism                                                        1
1        No specific belief                                             15
         Atheism                                                        15
         Agnosticism                                                    13
         Christianity                                                   10
         Spiritual, not affiliated with a specific religion              5
         Judaism                                                         1
         Buddhism                                                        1
         Unitarian                                                       1
Name: count, dtype: int64
No description has been provided for this image
Cluster  AOSCat           
0        STEM                 71
         Arts & Humanities    33
         Unknown              26
         Education            15
         Social Studies       14
         Business              8
         Health & Medicine     8
         Other                 8
1        STEM                 24
         Arts & Humanities    14
         Unknown              11
         Social Studies        5
         Business              2
         Health & Medicine     2
         Other                 2
         Education             1
Name: count, dtype: int64
No description has been provided for this image
Cluster  EnjoysBoardGames 
0        Yes                  171
         No                    11
         Prefer not to say      1
1        Yes                   60
         No                     1
Name: count, dtype: int64
No description has been provided for this image
Cluster  BoardGamesOwned  
0        2 to 5               60
         1 or 2               53
         5 to 10              38
         0                    19
         10 to 20              9
         More than 20          4
         Prefer not to say     0
1        5 to 10              22
         2 to 5               13
         10 to 20             11
         More than 20          9
         1 or 2                6
         Prefer not to say     0
         0                     0
Name: count, dtype: int64
No description has been provided for this image
Cluster  FrequencyOfPlay      
0        Every few months         45
         Several times a month    41
         Monthly                  28
         Rarely/Seldom            27
         Weekly                   22
         Several times a week     10
         Daily                     5
         Never                     5
         Prefer not to say         0
1        Several times a month    21
         Every few months          9
         Several times a week      9
         Monthly                   9
         Weekly                    8
         Rarely/Seldom             3
         Daily                     2
         Never                     0
         Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Cluster  Style             
0        Casual                66
         Situation-Specific    52
         Strategic             33
         Party/Social          19
         Simple                 8
         Not Interested         4
         Other                  1
1        Situation-Specific    39
         Casual                13
         Strategic              9
Name: count, dtype: int64
No description has been provided for this image
Cluster  ChessRating
0        1              36
         7              34
         2              21
         4              17
         5              17
         8              17
         6              16
         3              15
         10              5
         9               4
         Unknown         1
1        4               9
         7               9
         8               9
         5               8
         6               7
         1               7
         10              4
         3               3
         Unknown         2
         2               2
         9               1
Name: count, dtype: int64
No description has been provided for this image
In [60]:
bools_against_category(X,race_bool_features,"Cluster")
bools_against_category(X,element_bool_features,"Cluster")
bools_against_category(X,genre_bool_features,"Cluster")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Machine Learning Predictive Model¶

First Attempt at Machine Learning Model, Predicting Different Board Game Stats Based on Demographics¶

In [61]:
# load dataset
data = cleaned_df.copy()
data = data[['Gender', 'Age', 'AOSCat', 'Religiosity',
       
            "BoardGamesOwned","FrequencyOfPlay","Style","ChessRating",]]

# data = data.drop(columns=["WWUStatus","EmploymentStatus","Vision","Religiosity","AOS","EnjoysBoardGames",
#                           "EnjoyedBoardGames"])



data = filter_threshold(data,"Gender",20)
# data = data[data["Gender"] != "Other"]
#filter_threshold(data,"Age",10)["Age"].value_counts()
# data = filter_threshold
data = filter_threshold(data,"AOSCat",15)
data = filter_threshold(data,"Style",15)

data["FrequencyOfPlay"] = data["FrequencyOfPlay"].apply(lambda x: "At Least Weekly" if x in ["Daily","Several times a week","Weekly"] else x)


data["FrequencyOfPlay"] = pd.Categorical(data["FrequencyOfPlay"],
    categories=["At Least Weekly","Several times a month","Monthly","Every few months","Rarely/Seldom","Never","Prefer not to say"],ordered=True)

data["ChessRating"] = data["ChessRating"].replace('Unknown', np.nan).astype(float)
In [62]:
data = data.dropna(subset=['ChessRating'])
In [63]:
len(data)
Out[63]:
241

Age, Gender, and AOSCat cannot predict ChessRating

In [64]:
from sklearn.linear_model import LinearRegression


X, y = data.drop(columns=["BoardGamesOwned","FrequencyOfPlay","Style","ChessRating"]), data["ChessRating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, stratify=y, random_state=10)


onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), ["Gender","AOSCat","Religiosity" ]),
        # ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
        #                                        ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
        #                                        ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
        #                                        ["BoardGamesOwned",
        #                                         "FrequencyOfPlay",
        #                                         "ChessRating"])                         
    ], remainder="passthrough"
)


linear_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    LinearRegression()
)


dummy_classifier = DummyClassifier(strategy='uniform')


current_pipe = linear_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(dummy_classifier,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
Model Cross Val Scores: [-0.06687053 -0.09886846 -0.28592042]
Dummy Cross Val Scores: [-1.51349528 -1.09700428 -0.95893639]

Age, Gender, and AOSCat cannot predict BoardGamesOwned

In [65]:
data["BoardGamesOwned"]
Out[65]:
0                 0
1            2 to 5
2          10 to 20
3          10 to 20
4      More than 20
           ...     
239          2 to 5
240          1 or 2
241          2 to 5
242         5 to 10
243          2 to 5
Name: BoardGamesOwned, Length: 241, dtype: category
Categories (7, object): ['Prefer not to say' < '0' < '1 or 2' < '2 to 5' < '5 to 10' < '10 to 20' < 'More than 20']
In [66]:
y_encoder = OrdinalEncoder()
y_translated = y_encoder.fit_transform(data["BoardGamesOwned"].values.reshape(-1, 1))

X, y = data.drop(columns=["BoardGamesOwned","FrequencyOfPlay","Style","ChessRating"]), y_translated

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, stratify=y, random_state=20)


onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), ["Gender","AOSCat","Religiosity" ]),
        # ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
        #                                        ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
        #                                        ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
        #                                        ["BoardGamesOwned",
        #                                         "FrequencyOfPlay",
        #                                         "ChessRating"])                         
    ], remainder="passthrough"
)


linear_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    LinearRegression()
)


dummy_classifier = DummyClassifier(strategy='uniform')


current_pipe = linear_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(dummy_classifier,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
Model Cross Val Scores: [-0.3498374  -0.34265183 -0.10069825]
Dummy Cross Val Scores: [-1.93643688 -1.26533166 -1.42929443]
In [67]:
current_pipe.fit(X_train,y_train)
y_encoder.inverse_transform(current_pipe.predict(X_train))
Out[67]:
array([['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['2 to 5'],
       ['2 to 5'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['1 or 2'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['1 or 2'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['1 or 2'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['10 to 20'],
       ['1 or 2'],
       ['10 to 20'],
       ['0'],
       ['5 to 10'],
       ['2 to 5'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['1 or 2'],
       ['2 to 5'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['1 or 2'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['1 or 2'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['1 or 2'],
       ['2 to 5'],
       ['10 to 20'],
       ['1 or 2'],
       ['2 to 5'],
       ['1 or 2'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['10 to 20'],
       ['1 or 2'],
       ['2 to 5'],
       ['1 or 2'],
       ['1 or 2'],
       ['1 or 2'],
       ['1 or 2'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['1 or 2'],
       ['2 to 5'],
       ['0'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['1 or 2'],
       ['1 or 2'],
       ['0'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20'],
       ['10 to 20']], dtype=object)
In [68]:
y_encoder.inverse_transform(y_train)
Out[68]:
array([['2 to 5'],
       ['More than 20'],
       ['5 to 10'],
       ['5 to 10'],
       ['5 to 10'],
       ['1 or 2'],
       ['5 to 10'],
       ['2 to 5'],
       ['2 to 5'],
       ['10 to 20'],
       ['5 to 10'],
       ['5 to 10'],
       ['0'],
       ['5 to 10'],
       ['5 to 10'],
       ['5 to 10'],
       ['5 to 10'],
       ['2 to 5'],
       ['2 to 5'],
       ['1 or 2'],
       ['2 to 5'],
       ['1 or 2'],
       ['5 to 10'],
       ['2 to 5'],
       ['2 to 5'],
       ['2 to 5'],
       ['2 to 5'],
       ['More than 20'],
       ['0'],
       ['5 to 10'],
       ['1 or 2'],
       ['10 to 20'],
       ['5 to 10'],
       ['1 or 2'],
       ['10 to 20'],
       ['5 to 10'],
       ['2 to 5'],
       ['5 to 10'],
       ['2 to 5'],
       ['More than 20'],
       ['2 to 5'],
       ['1 or 2'],
       ['2 to 5'],
       ['0'],
       ['2 to 5'],
       ['10 to 20'],
       ['1 or 2'],
       ['5 to 10'],
       ['1 or 2'],
       ['10 to 20'],
       ['2 to 5'],
       ['2 to 5'],
       ['0'],
       ['5 to 10'],
       ['More than 20'],
       ['0'],
       ['2 to 5'],
       ['10 to 20'],
       ['0'],
       ['2 to 5'],
       ['5 to 10'],
       ['2 to 5'],
       ['1 or 2'],
       ['1 or 2'],
       ['1 or 2'],
       ['2 to 5'],
       ['2 to 5'],
       ['0'],
       ['1 or 2'],
       ['5 to 10'],
       ['2 to 5'],
       ['1 or 2'],
       ['0'],
       ['2 to 5'],
       ['1 or 2'],
       ['5 to 10'],
       ['2 to 5'],
       ['2 to 5'],
       ['5 to 10'],
       ['5 to 10'],
       ['1 or 2'],
       ['More than 20'],
       ['More than 20'],
       ['1 or 2'],
       ['1 or 2'],
       ['2 to 5'],
       ['1 or 2'],
       ['5 to 10'],
       ['1 or 2'],
       ['0'],
       ['5 to 10'],
       ['2 to 5'],
       ['1 or 2'],
       ['1 or 2'],
       ['10 to 20'],
       ['10 to 20'],
       ['2 to 5'],
       ['1 or 2'],
       ['1 or 2'],
       ['2 to 5'],
       ['5 to 10'],
       ['5 to 10'],
       ['More than 20'],
       ['1 or 2'],
       ['5 to 10'],
       ['2 to 5'],
       ['2 to 5'],
       ['1 or 2'],
       ['2 to 5'],
       ['1 or 2'],
       ['2 to 5'],
       ['5 to 10'],
       ['1 or 2'],
       ['1 or 2'],
       ['0'],
       ['5 to 10'],
       ['2 to 5'],
       ['1 or 2'],
       ['10 to 20'],
       ['2 to 5']], dtype=object)
In [69]:
current_pipe.fit(X_train,y_train)

current_pipe.score(X_train,y_train)
Out[69]:
0.2127876515344106

Age, Gender, and AOSCat cannot predict FrequencyOfPlay

In [70]:
y_encoder = OrdinalEncoder()
y_translated = y_encoder.fit_transform(data["FrequencyOfPlay"].values.reshape(-1, 1))

X, y = data.drop(columns=["BoardGamesOwned","FrequencyOfPlay","Style","ChessRating"]), y_translated

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, stratify=y, random_state=20)


onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), ["Gender","AOSCat", "Religiosity"]),
        # ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
        #                                        ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
        #                                        ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
        #                                        ["BoardGamesOwned",
        #                                         "FrequencyOfPlay",
        #                                         "ChessRating"])                         
    ], remainder="passthrough"
)


linear_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    LinearRegression()
)


dummy_classifier = DummyClassifier(strategy='uniform')


current_pipe = linear_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(dummy_classifier,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
Model Cross Val Scores: [-0.23147247 -0.35061096  0.03098003]
Dummy Cross Val Scores: [-0.58434894 -1.52812707 -0.24420913]
In [71]:
y_encoder = OrdinalEncoder()
y_translated = y_encoder.fit_transform(data["FrequencyOfPlay"].values.reshape(-1, 1))

X, y = data.drop(columns=["BoardGamesOwned","FrequencyOfPlay","Style","ChessRating"]), y_translated

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, stratify=y, random_state=20)


onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), ["Gender","AOSCat", "Religiosity"]),
        # ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
        #                                        ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
        #                                        ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
        #                                        ["BoardGamesOwned",
        #                                         "FrequencyOfPlay",
        #                                         "ChessRating"])                         
    ], remainder="passthrough"
)


linear_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    LinearRegression()
)


dummy_classifier = DummyClassifier(strategy='uniform')


current_pipe = linear_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(dummy_classifier,X_train,y_train,cv=3,scoring=make_scorer(r2_score))}")
Model Cross Val Scores: [-0.23147247 -0.35061096  0.03098003]
Dummy Cross Val Scores: [-0.47530468 -1.18398412 -0.79351423]
In [72]:
current_pipe.fit(X_train,y_train)
y_encoder.inverse_transform(current_pipe.predict(X_train))
Out[72]:
array([['Monthly'],
       ['Monthly'],
       ['Rarely/Seldom'],
       ['Monthly'],
       ['Every few months'],
       ['Rarely/Seldom'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Never'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Every few months'],
       ['Monthly'],
       ['Every few months'],
       ['Rarely/Seldom'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['At Least Weekly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Never'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Every few months'],
       ['Never'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Rarely/Seldom'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Never'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly'],
       ['Rarely/Seldom'],
       ['Never'],
       ['Monthly'],
       ['Monthly'],
       ['Every few months'],
       ['Monthly'],
       ['Monthly'],
       ['Monthly']], dtype=object)
In [73]:
y_encoder.inverse_transform(y_train)
Out[73]:
array([['Monthly'],
       ['Several times a month'],
       ['Rarely/Seldom'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Several times a month'],
       ['Several times a month'],
       ['Monthly'],
       ['Every few months'],
       ['At Least Weekly'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Several times a month'],
       ['Every few months'],
       ['Monthly'],
       ['Several times a month'],
       ['Rarely/Seldom'],
       ['Rarely/Seldom'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Monthly'],
       ['Rarely/Seldom'],
       ['Every few months'],
       ['Monthly'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Every few months'],
       ['Rarely/Seldom'],
       ['Several times a month'],
       ['Monthly'],
       ['Rarely/Seldom'],
       ['Several times a month'],
       ['Rarely/Seldom'],
       ['Every few months'],
       ['Monthly'],
       ['At Least Weekly'],
       ['Every few months'],
       ['At Least Weekly'],
       ['Monthly'],
       ['Every few months'],
       ['Every few months'],
       ['Every few months'],
       ['Every few months'],
       ['Rarely/Seldom'],
       ['Every few months'],
       ['At Least Weekly'],
       ['Never'],
       ['Several times a month'],
       ['At Least Weekly'],
       ['Monthly'],
       ['Every few months'],
       ['At Least Weekly'],
       ['Monthly'],
       ['Several times a month'],
       ['Never'],
       ['At Least Weekly'],
       ['At Least Weekly'],
       ['Every few months'],
       ['Every few months'],
       ['Monthly'],
       ['At Least Weekly'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Rarely/Seldom'],
       ['At Least Weekly'],
       ['At Least Weekly'],
       ['At Least Weekly'],
       ['Rarely/Seldom'],
       ['Monthly'],
       ['Several times a month'],
       ['Rarely/Seldom'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Several times a month'],
       ['Every few months'],
       ['Every few months'],
       ['Monthly'],
       ['Every few months'],
       ['Several times a month'],
       ['Several times a month'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Several times a month'],
       ['Every few months'],
       ['At Least Weekly'],
       ['Every few months'],
       ['Every few months'],
       ['Rarely/Seldom'],
       ['At Least Weekly'],
       ['Every few months'],
       ['Never'],
       ['Several times a month'],
       ['Rarely/Seldom'],
       ['Several times a month'],
       ['Every few months'],
       ['Rarely/Seldom'],
       ['Several times a month'],
       ['Rarely/Seldom'],
       ['At Least Weekly'],
       ['Monthly'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Several times a month'],
       ['At Least Weekly'],
       ['At Least Weekly'],
       ['At Least Weekly'],
       ['Several times a month'],
       ['Monthly'],
       ['Every few months'],
       ['Every few months'],
       ['Monthly'],
       ['Several times a month'],
       ['Every few months'],
       ['Every few months'],
       ['Several times a month'],
       ['Monthly'],
       ['Monthly'],
       ['Several times a month']], dtype=object)

Second Attempt at Building a Machine Learning Model, Predict Gender from Board Game Preferences¶

In [74]:
# load dataset
data = cleaned_df.copy()
data = data.drop(columns=["WWUStatus","Age","EmploymentStatus","Vision","Religiosity","AOS","AOSCat","EnjoyedBoardGames"] + race_bool_features)



data = filter_threshold(data,"Gender",40)
data = data[data["Gender"] != "Other"]
# data = filter_threshold(data,"AOSCat",15)
data = filter_threshold(data,"Style",15)
In [75]:
X, y = data.drop(columns=["Gender"]), data["Gender"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, stratify=y, random_state=40)


onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), ["Style","EnjoysBoardGames"]),
        ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
                                               ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
                                               ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
                                               ["BoardGamesOwned",
                                                "FrequencyOfPlay",
                                                "ChessRating"])                         
    ], remainder="passthrough"
)


logistic_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    LogisticRegression(max_iter=10000)
)

k_neighbors_classifier_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    KNeighborsClassifier()
)

decision_tree_classifier_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    DecisionTreeClassifier()
)

random_forest_classifier_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    RandomForestClassifier(random_state=40)
)

baseline_most_frequent = DummyClassifier(strategy='most_frequent')
baseline_uniform = DummyClassifier(strategy='uniform',random_state=40)


current_pipe = logistic_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_most_frequent,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_uniform,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
Model Cross Val Scores: [0.76190476 0.66666667 0.69047619 0.80952381]
Dummy Cross Val Scores: [0.52380952 0.52380952 0.52380952 0.5       ]
Dummy Cross Val Scores: [0.57142857 0.52380952 0.52380952 0.5       ]
In [76]:
current_pipe = k_neighbors_classifier_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_most_frequent,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_uniform,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
Model Cross Val Scores: [0.54761905 0.5952381  0.64285714 0.5952381 ]
Dummy Cross Val Scores: [0.52380952 0.52380952 0.52380952 0.5       ]
Dummy Cross Val Scores: [0.57142857 0.52380952 0.52380952 0.5       ]
In [77]:
current_pipe = decision_tree_classifier_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_most_frequent,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_uniform,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
Model Cross Val Scores: [0.61904762 0.47619048 0.61904762 0.54761905]
Dummy Cross Val Scores: [0.52380952 0.52380952 0.52380952 0.5       ]
Dummy Cross Val Scores: [0.57142857 0.52380952 0.52380952 0.5       ]
In [78]:
current_pipe = random_forest_classifier_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_most_frequent,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(baseline_uniform,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
Model Cross Val Scores: [0.5952381  0.57142857 0.69047619 0.71428571]
Dummy Cross Val Scores: [0.52380952 0.52380952 0.52380952 0.5       ]
Dummy Cross Val Scores: [0.57142857 0.52380952 0.52380952 0.5       ]
In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in Random Forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier(random_state=40)

random_forest_randomized_search_cv = make_pipeline(
    onehot_and_ordinal_transform,
    RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 4, verbose=2, random_state=42, n_jobs = -1)
)

random_forest_randomized_search_cv.fit(X_train, y_train)
Fitting 4 folds for each of 100 candidates, totalling 400 fits
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1800; total time=   3.2s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=90, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.8s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.4s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.7s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=2, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   1.7s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=5, n_estimators=1400; total time=   1.9s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.1s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   2.1s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   1.0s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   2.6s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.4s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.7s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=4, min_samples_split=10, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   2.6s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=   3.2s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=90, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   3.0s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=1600; total time=   2.1s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.7s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=1400; total time=   2.5s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.4s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   1.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=2, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   1.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=5, n_estimators=1400; total time=   1.9s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   1.1s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1800; total time=   3.2s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.6s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.8s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=4, min_samples_split=10, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   2.9s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=90, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=1600; total time=   2.1s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.8s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.6s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=1400; total time=   2.5s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=2, n_estimators=1200; total time=   2.0s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   1.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=5, n_estimators=1400; total time=   1.9s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.1s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   2.1s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   1.0s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.6s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.4s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.7s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=4, min_samples_split=10, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   2.6s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=   3.2s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=False, max_depth=90, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   2.3s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   2.9s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=90, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=1600; total time=   2.1s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.8s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=1400; total time=   2.5s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.6s
[CV] END bootstrap=False, max_depth=90, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=5, n_estimators=1200; total time=   1.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   1.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=2, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   1.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=1, min_samples_split=5, n_estimators=1400; total time=   1.9s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=False, max_depth=70, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=False, max_depth=110, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   0.8s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   1.0s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=80, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.1s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.6s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   3.6s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.1s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.7s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=4, min_samples_split=10, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   2.6s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=False, max_depth=80, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   1.6s
[CV] END bootstrap=True, max_depth=110, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=   3.1s
[CV] END bootstrap=False, max_depth=60, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   1.3s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=70, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=   2.4s
[CV] END bootstrap=True, max_depth=60, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.6s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=4, min_samples_split=2, n_estimators=1400; total time=   1.8s
[CV] END bootstrap=False, max_depth=90, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   2.4s
[CV] END bootstrap=True, max_depth=100, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   3.5s
Out[79]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Style',
                                                   'EnjoysBoardGames']),
                                                 ('ordinal',
                                                  Pipeline(steps=[('ordinalencoder',
                                                                   OrdinalEncoder(categories=[['Prefer '
                                                                                               'not '
                                                                                               'to '
                                                                                               'say',
                                                                                               '0',
                                                                                               '1 '
                                                                                               'or '
                                                                                               '2',
                                                                                               '2 '
                                                                                               'to '
                                                                                               '5',
                                                                                               '5 '
                                                                                               'to '
                                                                                               '10',
                                                                                               '10 '
                                                                                               'to '
                                                                                               '20',
                                                                                               'More '
                                                                                               'than '
                                                                                               '20'],
                                                                                              ['Daily',
                                                                                               'Sev...
                ('randomizedsearchcv',
                 RandomizedSearchCV(cv=4,
                                    estimator=RandomForestClassifier(random_state=40),
                                    n_iter=100, n_jobs=-1,
                                    param_distributions={'bootstrap': [True,
                                                                       False],
                                                         'max_depth': [10, 20,
                                                                       30, 40,
                                                                       50, 60,
                                                                       70, 80,
                                                                       90, 100,
                                                                       110,
                                                                       None],
                                                         'min_samples_leaf': [1,
                                                                              2,
                                                                              4],
                                                         'min_samples_split': [2,
                                                                               5,
                                                                               10],
                                                         'n_estimators': [200,
                                                                          400,
                                                                          600,
                                                                          800,
                                                                          1000,
                                                                          1200,
                                                                          1400,
                                                                          1600,
                                                                          1800,
                                                                          2000]},
                                    random_state=42, verbose=2))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Style',
                                                   'EnjoysBoardGames']),
                                                 ('ordinal',
                                                  Pipeline(steps=[('ordinalencoder',
                                                                   OrdinalEncoder(categories=[['Prefer '
                                                                                               'not '
                                                                                               'to '
                                                                                               'say',
                                                                                               '0',
                                                                                               '1 '
                                                                                               'or '
                                                                                               '2',
                                                                                               '2 '
                                                                                               'to '
                                                                                               '5',
                                                                                               '5 '
                                                                                               'to '
                                                                                               '10',
                                                                                               '10 '
                                                                                               'to '
                                                                                               '20',
                                                                                               'More '
                                                                                               'than '
                                                                                               '20'],
                                                                                              ['Daily',
                                                                                               'Sev...
                ('randomizedsearchcv',
                 RandomizedSearchCV(cv=4,
                                    estimator=RandomForestClassifier(random_state=40),
                                    n_iter=100, n_jobs=-1,
                                    param_distributions={'bootstrap': [True,
                                                                       False],
                                                         'max_depth': [10, 20,
                                                                       30, 40,
                                                                       50, 60,
                                                                       70, 80,
                                                                       90, 100,
                                                                       110,
                                                                       None],
                                                         'min_samples_leaf': [1,
                                                                              2,
                                                                              4],
                                                         'min_samples_split': [2,
                                                                               5,
                                                                               10],
                                                         'n_estimators': [200,
                                                                          400,
                                                                          600,
                                                                          800,
                                                                          1000,
                                                                          1200,
                                                                          1400,
                                                                          1600,
                                                                          1800,
                                                                          2000]},
                                    random_state=42, verbose=2))])
ColumnTransformer(remainder='passthrough',
                  transformers=[('onehot',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['Style', 'EnjoysBoardGames']),
                                ('ordinal',
                                 Pipeline(steps=[('ordinalencoder',
                                                  OrdinalEncoder(categories=[['Prefer '
                                                                              'not '
                                                                              'to '
                                                                              'say',
                                                                              '0',
                                                                              '1 '
                                                                              'or '
                                                                              '2',
                                                                              '2 '
                                                                              'to '
                                                                              '5',
                                                                              '5 '
                                                                              'to '
                                                                              '10',
                                                                              '10 '
                                                                              'to '
                                                                              '20',
                                                                              'More '
                                                                              'than '
                                                                              '20'],
                                                                             ['Daily',
                                                                              'Several '
                                                                              'times '
                                                                              'a '
                                                                              'week',
                                                                              'Weekly',
                                                                              'Several '
                                                                              'times '
                                                                              'a '
                                                                              'month',
                                                                              'Monthly',
                                                                              'Every '
                                                                              'few '
                                                                              'months',
                                                                              'Rarely/Seldom',
                                                                              'Never',
                                                                              'Prefer '
                                                                              'not '
                                                                              'to '
                                                                              'say'],
                                                                             ['Unknown',
                                                                              '1',
                                                                              '2',
                                                                              '3',
                                                                              '4',
                                                                              '5',
                                                                              '6',
                                                                              '7',
                                                                              '8',
                                                                              '9',
                                                                              '10']])),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['BoardGamesOwned', 'FrequencyOfPlay',
                                  'ChessRating'])])
['Style', 'EnjoysBoardGames']
OneHotEncoder(handle_unknown='ignore')
['BoardGamesOwned', 'FrequencyOfPlay', 'ChessRating']
OrdinalEncoder(categories=[['Prefer not to say', '0', '1 or 2', '2 to 5',
                            '5 to 10', '10 to 20', 'More than 20'],
                           ['Daily', 'Several times a week', 'Weekly',
                            'Several times a month', 'Monthly',
                            'Every few months', 'Rarely/Seldom', 'Never',
                            'Prefer not to say'],
                           ['Unknown', '1', '2', '3', '4', '5', '6', '7', '8',
                            '9', '10']])
StandardScaler()
['Conflict/CompetitionIsPreferredElement', 'CooperationIsPreferredElement', 'Heavy/ImmersiveThemingIsPreferredElement', 'LuckIsPreferredElement', 'Party/Low-StakesIsPreferredElement', 'Puzzle-SolvingIsPreferredElement', 'SocialDeduction/HiddenRoleIsPreferredElement', 'StrategyIsPreferredElement', 'TriviaIsPreferredElement', 'AbstractStrategyIsEnjoyedGenre', 'AdventureIsEnjoyedGenre', 'AnimalsIsEnjoyedGenre', 'AuctionIsEnjoyedGenre', 'CardIsEnjoyedGenre', 'CardDraftingIsEnjoyedGenre', 'CivilizationIsEnjoyedGenre', 'CooperativeIsEnjoyedGenre', 'Deck-BuildingIsEnjoyedGenre', 'DeductionIsEnjoyedGenre', 'EconomicIsEnjoyedGenre', 'EducationalIsEnjoyedGenre', 'ExplorationIsEnjoyedGenre', 'FantasyIsEnjoyedGenre', 'FarmingIsEnjoyedGenre', 'FightingIsEnjoyedGenre', 'HorrorIsEnjoyedGenre', 'LuckIsEnjoyedGenre', 'MedievalIsEnjoyedGenre', 'MemoryIsEnjoyedGenre', 'MiniaturesIsEnjoyedGenre', 'PartyIsEnjoyedGenre', 'PiratesIsEnjoyedGenre', 'PoliticalIsEnjoyedGenre', 'PuzzleIsEnjoyedGenre', 'RacingIsEnjoyedGenre', 'Role-PlayingIsEnjoyedGenre', 'RollandMoveIsEnjoyedGenre', 'ScienceFictionIsEnjoyedGenre', 'SocialDeduction/HiddenRoleIsEnjoyedGenre', 'SportsIsEnjoyedGenre', 'StrategyIsEnjoyedGenre', 'TerritoryBuildingIsEnjoyedGenre', 'Tile-LayingIsEnjoyedGenre', 'TrainsIsEnjoyedGenre', 'TransportationIsEnjoyedGenre', 'TravelIsEnjoyedGenre', 'TriviaIsEnjoyedGenre', 'WarIsEnjoyedGenre', 'WordIsEnjoyedGenre', 'WorkerPlacementIsEnjoyedGenre', 'WorldWarIIIsEnjoyedGenre', 'ZombiesIsEnjoyedGenre']
passthrough
RandomizedSearchCV(cv=4, estimator=RandomForestClassifier(random_state=40),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)
RandomForestClassifier(random_state=40)
RandomForestClassifier(random_state=40)
In [80]:
cv_results = random_forest_randomized_search_cv.named_steps['randomizedsearchcv'].cv_results_
In [81]:
# for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
#     print(params, 'has a cross-validated score of', mean_score)
In [82]:
# random_forest_randomized_search_cv.named_steps['randomizedsearchcv'].best_params_
In [83]:
random_forest_randomized_search_cv.named_steps['randomizedsearchcv'].best_score_
Out[83]:
0.7202380952380953
In [84]:
conf_matrix = confusion_matrix(y_test, random_forest_randomized_search_cv.predict(X_test),labels=["Man","Woman",])

print(conf_matrix)

print(f"Model Accuracy Score: {accuracy_score(random_forest_randomized_search_cv.predict(X_test),y_test)}")



baseline_most_frequent.fit(X_train, y_train)
y_pred_most_frequent = baseline_most_frequent.predict(X_test)

baseline_uniform.fit(X_train,y_train)
y_pred_uniform = baseline_uniform.predict(X_test)

print(f"Baseline Most Frequent Score: {accuracy_score(y_test, y_pred_most_frequent)}")
print(f"Baseline Uniform Score: {accuracy_score(y_test, y_pred_uniform)}")
[[15  5]
 [ 4 18]]
Model Accuracy Score: 0.7857142857142857
Baseline Most Frequent Score: 0.5238095238095238
Baseline Uniform Score: 0.42857142857142855
In [85]:
# logistic_pipe.fit(X_train,y_train)

# conf_matrix = confusion_matrix(y_test, logistic_pipe.predict(X_test),labels=["Man","Woman"])

# print(conf_matrix)

# print(f"Model Accuracy Score: {accuracy_score(logistic_pipe.predict(X_test),y_test)}")



# baseline_most_frequent.fit(X_train, y_train)
# y_pred_most_frequent = baseline_most_frequent.predict(X_test)

# baseline_uniform.fit(X_train,y_train)
# y_pred_uniform = baseline_uniform.predict(X_test)

# print(f"Baseline Most Frequent Score: {accuracy_score(y_test, y_pred_most_frequent)}")
# print(f"Baseline Uniform Score: {accuracy_score(y_test, y_pred_uniform)}")